wp2txt 0.9.5 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +134 -57
- data/bin/wp2txt +149 -95
- data/data/output_samples/testdata_en.txt +171 -1247
- data/data/output_samples/{testdata_en_categories.txt → testdata_en_category.txt} +1 -1
- data/data/output_samples/testdata_en_summary.txt +28 -20
- data/data/output_samples/testdata_ja.txt +10359 -17093
- data/data/output_samples/{testdata_ja_categories.txt → testdata_ja_category.txt} +30 -30
- data/data/output_samples/testdata_ja_summary.txt +36 -160
- data/image/screenshot.png +0 -0
- data/image/wp2txt-logo.svg +16 -0
- data/image/wp2txt.svg +31 -0
- data/lib/wp2txt/article.rb +1 -3
- data/lib/wp2txt/utils.rb +92 -68
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +154 -171
- data/spec/utils_spec.rb +3 -21
- data/wp2txt.gemspec +7 -3
- metadata +54 -12
- data/bin/benchmark.rb +0 -76
- data/lib/wp2txt/mw_api.rb +0 -65
- data/lib/wp2txt/progressbar.rb +0 -305
data/lib/wp2txt/utils.rb
CHANGED
@@ -77,6 +77,22 @@ $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.esca
|
|
77
77
|
$single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
|
78
78
|
$double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
|
79
79
|
$curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
|
80
|
+
|
81
|
+
$complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
|
82
|
+
$complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
|
83
|
+
$complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
|
84
|
+
$complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
85
|
+
$complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
86
|
+
|
87
|
+
$cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
|
88
|
+
$cleanup_regex_02 = Regexp.new('^File:.+$')
|
89
|
+
$cleanup_regex_03 = Regexp.new('^\|.*$')
|
90
|
+
$cleanup_regex_04 = Regexp.new('\{\{.*$')
|
91
|
+
$cleanup_regex_05 = Regexp.new('^.*\}\}')
|
92
|
+
$cleanup_regex_06 = Regexp.new('\{\|.*$')
|
93
|
+
$cleanup_regex_07 = Regexp.new('^.*\|\}')
|
94
|
+
$cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
|
95
|
+
|
80
96
|
###################################################
|
81
97
|
|
82
98
|
module Wp2txt
|
@@ -104,11 +120,12 @@ module Wp2txt
|
|
104
120
|
end
|
105
121
|
|
106
122
|
def format_wiki!(text, has_retried = false)
|
123
|
+
remove_complex!(text)
|
124
|
+
|
107
125
|
escape_nowiki!(text)
|
108
126
|
process_interwiki_links!(text)
|
109
127
|
process_external_links!(text)
|
110
128
|
unescape_nowiki!(text)
|
111
|
-
|
112
129
|
remove_directive!(text)
|
113
130
|
remove_emphasis!(text)
|
114
131
|
mndash!(text)
|
@@ -120,61 +137,64 @@ module Wp2txt
|
|
120
137
|
end
|
121
138
|
|
122
139
|
def cleanup!(text)
|
123
|
-
text.gsub!(
|
124
|
-
text.gsub!(
|
125
|
-
text.gsub!(
|
126
|
-
text.gsub!(
|
127
|
-
text.gsub!(
|
128
|
-
text.gsub!(
|
140
|
+
text.gsub!($cleanup_regex_01){""}
|
141
|
+
text.gsub!($cleanup_regex_02){""}
|
142
|
+
text.gsub!($cleanup_regex_03){""}
|
143
|
+
text.gsub!($cleanup_regex_04){""}
|
144
|
+
text.gsub!($cleanup_regex_05){""}
|
145
|
+
text.gsub!($cleanup_regex_06){""}
|
146
|
+
text.gsub!($cleanup_regex_07){""}
|
147
|
+
text.gsub!($cleanup_regex_08){"\n\n"}
|
129
148
|
text.strip!
|
130
149
|
text << "\n\n"
|
131
150
|
end
|
151
|
+
|
132
152
|
#################### parser for nested structure ####################
|
133
153
|
|
134
154
|
def process_nested_structure(scanner, left, right, &block)
|
135
155
|
test = false
|
136
156
|
buffer = ""
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
buffer << str
|
155
|
-
has_left = true
|
156
|
-
when right
|
157
|
-
if has_left
|
158
|
-
buffer = buffer[0...-(left.size)]
|
159
|
-
contents = block.call(str[0...-(left.size)])
|
160
|
-
buffer << contents
|
161
|
-
break
|
162
|
-
else
|
157
|
+
begin
|
158
|
+
if left == "[" && right == "]"
|
159
|
+
regex = $single_square_bracket_regex
|
160
|
+
elsif left == "[[" && right == "]]"
|
161
|
+
regex = $double_square_bracket_regex
|
162
|
+
elsif left == "{" && right == "}"
|
163
|
+
regex = $single_curly_bracket_regex
|
164
|
+
elsif left == "{{" && right == "}}"
|
165
|
+
regex = $double_curly_bracket_regex
|
166
|
+
elsif left == "{|" && right == "|}"
|
167
|
+
regex = $curly_square_bracket_regex
|
168
|
+
else
|
169
|
+
regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
|
170
|
+
end
|
171
|
+
while str = scanner.scan_until(regex)
|
172
|
+
case scanner[1]
|
173
|
+
when left
|
163
174
|
buffer << str
|
175
|
+
has_left = true
|
176
|
+
when right
|
177
|
+
if has_left
|
178
|
+
buffer = buffer[0...-(left.size)]
|
179
|
+
contents = block.call(str[0...-(left.size)])
|
180
|
+
buffer << contents
|
181
|
+
break
|
182
|
+
else
|
183
|
+
buffer << str
|
184
|
+
end
|
164
185
|
end
|
165
186
|
end
|
166
|
-
|
167
|
-
buffer << scanner.rest
|
187
|
+
buffer << scanner.rest
|
168
188
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
189
|
+
if buffer == scanner.string
|
190
|
+
return buffer
|
191
|
+
else
|
192
|
+
scanner.string = buffer
|
193
|
+
return process_nested_structure(scanner, left, right, &block) || ""
|
194
|
+
end
|
195
|
+
rescue => e
|
196
|
+
return scanner.string
|
174
197
|
end
|
175
|
-
# rescue => e
|
176
|
-
# return scanner.string
|
177
|
-
# end
|
178
198
|
end
|
179
199
|
|
180
200
|
#################### methods used from format_wiki ####################
|
@@ -217,12 +237,16 @@ module Wp2txt
|
|
217
237
|
def process_external_links!(str)
|
218
238
|
scanner = StringScanner.new(str)
|
219
239
|
result = process_nested_structure(scanner, "[", "]") do |contents|
|
220
|
-
|
221
|
-
|
222
|
-
when 1
|
223
|
-
parts.first || ""
|
240
|
+
if /\A\s.+\s\z/ =~ contents
|
241
|
+
" (#{contents.strip}) "
|
224
242
|
else
|
225
|
-
parts
|
243
|
+
parts = contents.split(" ", 2)
|
244
|
+
case parts.size
|
245
|
+
when 1
|
246
|
+
parts.first || ""
|
247
|
+
else
|
248
|
+
parts.last || ""
|
249
|
+
end
|
226
250
|
end
|
227
251
|
end
|
228
252
|
str.replace(result)
|
@@ -239,10 +263,6 @@ module Wp2txt
|
|
239
263
|
result = process_nested_structure(scanner, "{", "}") do |contents|
|
240
264
|
""
|
241
265
|
end
|
242
|
-
scanner = StringScanner.new(result)
|
243
|
-
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
244
|
-
""
|
245
|
-
end
|
246
266
|
str.replace(result)
|
247
267
|
end
|
248
268
|
|
@@ -310,7 +330,8 @@ module Wp2txt
|
|
310
330
|
end
|
311
331
|
|
312
332
|
def remove_html!(str)
|
313
|
-
["
|
333
|
+
str.gsub!(/<[^<>]+\/>/){""}
|
334
|
+
["div", "gallery", "timeline", "noinclude"].each do |tag|
|
314
335
|
scanner = StringScanner.new(str)
|
315
336
|
result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
|
316
337
|
""
|
@@ -320,11 +341,11 @@ module Wp2txt
|
|
320
341
|
end
|
321
342
|
|
322
343
|
def remove_complex!(str)
|
323
|
-
str.gsub!(
|
324
|
-
str.gsub!(
|
325
|
-
str.gsub!(
|
326
|
-
str.gsub!(
|
327
|
-
str.gsub!(
|
344
|
+
str.gsub!($complex_regex_01){"《#{$1}》"}
|
345
|
+
str.gsub!($complex_regex_02){""}
|
346
|
+
str.gsub!($complex_regex_03){""}
|
347
|
+
str.gsub!($complex_regex_04){""}
|
348
|
+
str.gsub!($complex_regex_05){""}
|
328
349
|
end
|
329
350
|
|
330
351
|
def make_reference!(str)
|
@@ -340,6 +361,8 @@ module Wp2txt
|
|
340
361
|
parts = contents.split("|")
|
341
362
|
if /\A(?:lang|fontsize)\z/i =~ parts[0]
|
342
363
|
parts.shift
|
364
|
+
elsif /\Alang\-/i =~ parts[0]
|
365
|
+
parts.shift
|
343
366
|
elsif /\Alang=/i =~ parts[1]
|
344
367
|
parts.shift
|
345
368
|
end
|
@@ -347,10 +370,14 @@ module Wp2txt
|
|
347
370
|
if parts.size == 1
|
348
371
|
out = parts[0]
|
349
372
|
else
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
373
|
+
begin
|
374
|
+
keyval = parts[1].split("=")
|
375
|
+
if keyval.size > 1
|
376
|
+
out = keyval[1]
|
377
|
+
else
|
378
|
+
out = parts[1] || ""
|
379
|
+
end
|
380
|
+
rescue
|
354
381
|
out = parts[1] || ""
|
355
382
|
end
|
356
383
|
end
|
@@ -418,7 +445,7 @@ module Wp2txt
|
|
418
445
|
end
|
419
446
|
end
|
420
447
|
|
421
|
-
def rename(files)
|
448
|
+
def rename(files, ext = "txt")
|
422
449
|
# num of digits necessary to name the last file generated
|
423
450
|
maxwidth = 0
|
424
451
|
|
@@ -431,8 +458,9 @@ module Wp2txt
|
|
431
458
|
newname= f.sub(/\-(\d+)\z/) do
|
432
459
|
"-" + sprintf("%0#{maxwidth}d", $1.to_i)
|
433
460
|
end
|
434
|
-
File.rename(f, newname + "
|
461
|
+
File.rename(f, newname + ".#{ext}")
|
435
462
|
end
|
463
|
+
return true
|
436
464
|
end
|
437
465
|
|
438
466
|
# convert int of seconds to string in the format 00:00:00
|
@@ -448,8 +476,4 @@ module Wp2txt
|
|
448
476
|
return str
|
449
477
|
end
|
450
478
|
|
451
|
-
def decimal_format(i)
|
452
|
-
str = i.to_s.reverse
|
453
|
-
return str.scan(/.?.?./).join(',').reverse
|
454
|
-
end
|
455
479
|
end
|
data/lib/wp2txt/version.rb
CHANGED