wp2txt 0.9.5 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +134 -57
- data/bin/wp2txt +149 -95
- data/data/output_samples/testdata_en.txt +171 -1247
- data/data/output_samples/{testdata_en_categories.txt → testdata_en_category.txt} +1 -1
- data/data/output_samples/testdata_en_summary.txt +28 -20
- data/data/output_samples/testdata_ja.txt +10359 -17093
- data/data/output_samples/{testdata_ja_categories.txt → testdata_ja_category.txt} +30 -30
- data/data/output_samples/testdata_ja_summary.txt +36 -160
- data/image/screenshot.png +0 -0
- data/image/wp2txt-logo.svg +16 -0
- data/image/wp2txt.svg +31 -0
- data/lib/wp2txt/article.rb +1 -3
- data/lib/wp2txt/utils.rb +92 -68
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +154 -171
- data/spec/utils_spec.rb +3 -21
- data/wp2txt.gemspec +7 -3
- metadata +54 -12
- data/bin/benchmark.rb +0 -76
- data/lib/wp2txt/mw_api.rb +0 -65
- data/lib/wp2txt/progressbar.rb +0 -305
data/lib/wp2txt/utils.rb
CHANGED
@@ -77,6 +77,22 @@ $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.esca
|
|
77
77
|
$single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
|
78
78
|
$double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
|
79
79
|
$curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
|
80
|
+
|
81
|
+
$complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
|
82
|
+
$complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
|
83
|
+
$complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
|
84
|
+
$complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
85
|
+
$complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
86
|
+
|
87
|
+
$cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
|
88
|
+
$cleanup_regex_02 = Regexp.new('^File:.+$')
|
89
|
+
$cleanup_regex_03 = Regexp.new('^\|.*$')
|
90
|
+
$cleanup_regex_04 = Regexp.new('\{\{.*$')
|
91
|
+
$cleanup_regex_05 = Regexp.new('^.*\}\}')
|
92
|
+
$cleanup_regex_06 = Regexp.new('\{\|.*$')
|
93
|
+
$cleanup_regex_07 = Regexp.new('^.*\|\}')
|
94
|
+
$cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
|
95
|
+
|
80
96
|
###################################################
|
81
97
|
|
82
98
|
module Wp2txt
|
@@ -104,11 +120,12 @@ module Wp2txt
|
|
104
120
|
end
|
105
121
|
|
106
122
|
def format_wiki!(text, has_retried = false)
|
123
|
+
remove_complex!(text)
|
124
|
+
|
107
125
|
escape_nowiki!(text)
|
108
126
|
process_interwiki_links!(text)
|
109
127
|
process_external_links!(text)
|
110
128
|
unescape_nowiki!(text)
|
111
|
-
|
112
129
|
remove_directive!(text)
|
113
130
|
remove_emphasis!(text)
|
114
131
|
mndash!(text)
|
@@ -120,61 +137,64 @@ module Wp2txt
|
|
120
137
|
end
|
121
138
|
|
122
139
|
def cleanup!(text)
|
123
|
-
text.gsub!(
|
124
|
-
text.gsub!(
|
125
|
-
text.gsub!(
|
126
|
-
text.gsub!(
|
127
|
-
text.gsub!(
|
128
|
-
text.gsub!(
|
140
|
+
text.gsub!($cleanup_regex_01){""}
|
141
|
+
text.gsub!($cleanup_regex_02){""}
|
142
|
+
text.gsub!($cleanup_regex_03){""}
|
143
|
+
text.gsub!($cleanup_regex_04){""}
|
144
|
+
text.gsub!($cleanup_regex_05){""}
|
145
|
+
text.gsub!($cleanup_regex_06){""}
|
146
|
+
text.gsub!($cleanup_regex_07){""}
|
147
|
+
text.gsub!($cleanup_regex_08){"\n\n"}
|
129
148
|
text.strip!
|
130
149
|
text << "\n\n"
|
131
150
|
end
|
151
|
+
|
132
152
|
#################### parser for nested structure ####################
|
133
153
|
|
134
154
|
def process_nested_structure(scanner, left, right, &block)
|
135
155
|
test = false
|
136
156
|
buffer = ""
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
buffer << str
|
155
|
-
has_left = true
|
156
|
-
when right
|
157
|
-
if has_left
|
158
|
-
buffer = buffer[0...-(left.size)]
|
159
|
-
contents = block.call(str[0...-(left.size)])
|
160
|
-
buffer << contents
|
161
|
-
break
|
162
|
-
else
|
157
|
+
begin
|
158
|
+
if left == "[" && right == "]"
|
159
|
+
regex = $single_square_bracket_regex
|
160
|
+
elsif left == "[[" && right == "]]"
|
161
|
+
regex = $double_square_bracket_regex
|
162
|
+
elsif left == "{" && right == "}"
|
163
|
+
regex = $single_curly_bracket_regex
|
164
|
+
elsif left == "{{" && right == "}}"
|
165
|
+
regex = $double_curly_bracket_regex
|
166
|
+
elsif left == "{|" && right == "|}"
|
167
|
+
regex = $curly_square_bracket_regex
|
168
|
+
else
|
169
|
+
regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
|
170
|
+
end
|
171
|
+
while str = scanner.scan_until(regex)
|
172
|
+
case scanner[1]
|
173
|
+
when left
|
163
174
|
buffer << str
|
175
|
+
has_left = true
|
176
|
+
when right
|
177
|
+
if has_left
|
178
|
+
buffer = buffer[0...-(left.size)]
|
179
|
+
contents = block.call(str[0...-(left.size)])
|
180
|
+
buffer << contents
|
181
|
+
break
|
182
|
+
else
|
183
|
+
buffer << str
|
184
|
+
end
|
164
185
|
end
|
165
186
|
end
|
166
|
-
|
167
|
-
buffer << scanner.rest
|
187
|
+
buffer << scanner.rest
|
168
188
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
189
|
+
if buffer == scanner.string
|
190
|
+
return buffer
|
191
|
+
else
|
192
|
+
scanner.string = buffer
|
193
|
+
return process_nested_structure(scanner, left, right, &block) || ""
|
194
|
+
end
|
195
|
+
rescue => e
|
196
|
+
return scanner.string
|
174
197
|
end
|
175
|
-
# rescue => e
|
176
|
-
# return scanner.string
|
177
|
-
# end
|
178
198
|
end
|
179
199
|
|
180
200
|
#################### methods used from format_wiki ####################
|
@@ -217,12 +237,16 @@ module Wp2txt
|
|
217
237
|
def process_external_links!(str)
|
218
238
|
scanner = StringScanner.new(str)
|
219
239
|
result = process_nested_structure(scanner, "[", "]") do |contents|
|
220
|
-
|
221
|
-
|
222
|
-
when 1
|
223
|
-
parts.first || ""
|
240
|
+
if /\A\s.+\s\z/ =~ contents
|
241
|
+
" (#{contents.strip}) "
|
224
242
|
else
|
225
|
-
parts
|
243
|
+
parts = contents.split(" ", 2)
|
244
|
+
case parts.size
|
245
|
+
when 1
|
246
|
+
parts.first || ""
|
247
|
+
else
|
248
|
+
parts.last || ""
|
249
|
+
end
|
226
250
|
end
|
227
251
|
end
|
228
252
|
str.replace(result)
|
@@ -239,10 +263,6 @@ module Wp2txt
|
|
239
263
|
result = process_nested_structure(scanner, "{", "}") do |contents|
|
240
264
|
""
|
241
265
|
end
|
242
|
-
scanner = StringScanner.new(result)
|
243
|
-
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
244
|
-
""
|
245
|
-
end
|
246
266
|
str.replace(result)
|
247
267
|
end
|
248
268
|
|
@@ -310,7 +330,8 @@ module Wp2txt
|
|
310
330
|
end
|
311
331
|
|
312
332
|
def remove_html!(str)
|
313
|
-
["
|
333
|
+
str.gsub!(/<[^<>]+\/>/){""}
|
334
|
+
["div", "gallery", "timeline", "noinclude"].each do |tag|
|
314
335
|
scanner = StringScanner.new(str)
|
315
336
|
result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
|
316
337
|
""
|
@@ -320,11 +341,11 @@ module Wp2txt
|
|
320
341
|
end
|
321
342
|
|
322
343
|
def remove_complex!(str)
|
323
|
-
str.gsub!(
|
324
|
-
str.gsub!(
|
325
|
-
str.gsub!(
|
326
|
-
str.gsub!(
|
327
|
-
str.gsub!(
|
344
|
+
str.gsub!($complex_regex_01){"《#{$1}》"}
|
345
|
+
str.gsub!($complex_regex_02){""}
|
346
|
+
str.gsub!($complex_regex_03){""}
|
347
|
+
str.gsub!($complex_regex_04){""}
|
348
|
+
str.gsub!($complex_regex_05){""}
|
328
349
|
end
|
329
350
|
|
330
351
|
def make_reference!(str)
|
@@ -340,6 +361,8 @@ module Wp2txt
|
|
340
361
|
parts = contents.split("|")
|
341
362
|
if /\A(?:lang|fontsize)\z/i =~ parts[0]
|
342
363
|
parts.shift
|
364
|
+
elsif /\Alang\-/i =~ parts[0]
|
365
|
+
parts.shift
|
343
366
|
elsif /\Alang=/i =~ parts[1]
|
344
367
|
parts.shift
|
345
368
|
end
|
@@ -347,10 +370,14 @@ module Wp2txt
|
|
347
370
|
if parts.size == 1
|
348
371
|
out = parts[0]
|
349
372
|
else
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
373
|
+
begin
|
374
|
+
keyval = parts[1].split("=")
|
375
|
+
if keyval.size > 1
|
376
|
+
out = keyval[1]
|
377
|
+
else
|
378
|
+
out = parts[1] || ""
|
379
|
+
end
|
380
|
+
rescue
|
354
381
|
out = parts[1] || ""
|
355
382
|
end
|
356
383
|
end
|
@@ -418,7 +445,7 @@ module Wp2txt
|
|
418
445
|
end
|
419
446
|
end
|
420
447
|
|
421
|
-
def rename(files)
|
448
|
+
def rename(files, ext = "txt")
|
422
449
|
# num of digits necessary to name the last file generated
|
423
450
|
maxwidth = 0
|
424
451
|
|
@@ -431,8 +458,9 @@ module Wp2txt
|
|
431
458
|
newname= f.sub(/\-(\d+)\z/) do
|
432
459
|
"-" + sprintf("%0#{maxwidth}d", $1.to_i)
|
433
460
|
end
|
434
|
-
File.rename(f, newname + "
|
461
|
+
File.rename(f, newname + ".#{ext}")
|
435
462
|
end
|
463
|
+
return true
|
436
464
|
end
|
437
465
|
|
438
466
|
# convert int of seconds to string in the format 00:00:00
|
@@ -448,8 +476,4 @@ module Wp2txt
|
|
448
476
|
return str
|
449
477
|
end
|
450
478
|
|
451
|
-
def decimal_format(i)
|
452
|
-
str = i.to_s.reverse
|
453
|
-
return str.scan(/.?.?./).join(',').reverse
|
454
|
-
end
|
455
479
|
end
|
data/lib/wp2txt/version.rb
CHANGED