wp2txt 0.9.3 → 0.9.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +40 -25
- data/bin/wp2txt +13 -7
- data/data/output_samples/testdata_en.txt +11923 -36921
- data/data/output_samples/testdata_en_categories.txt +131 -823
- data/data/output_samples/testdata_en_summary.txt +1368 -0
- data/data/output_samples/testdata_ja.txt +24812 -4686
- data/data/output_samples/testdata_ja_categories.txt +205 -187
- data/data/output_samples/testdata_ja_summary.txt +1684 -0
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/lib/wp2txt/article.rb +3 -2
- data/lib/wp2txt/utils.rb +82 -54
- data/lib/wp2txt/version.rb +1 -1
- metadata +5 -3
data/data/testdata_en.bz2
CHANGED
Binary file
|
data/data/testdata_ja.bz2
CHANGED
Binary file
|
data/lib/wp2txt/article.rb
CHANGED
@@ -37,10 +37,11 @@ module Wp2txt
|
|
37
37
|
def initialize(text, title = "", strip_tmarker = false)
|
38
38
|
@title = title.strip
|
39
39
|
@strip_tmarker = strip_tmarker
|
40
|
-
convert_characters!(text)
|
40
|
+
convert_characters!(text)
|
41
|
+
remove_html!(text)
|
42
|
+
remove_complex!(text)
|
41
43
|
make_reference!(text)
|
42
44
|
remove_ref!(text)
|
43
|
-
|
44
45
|
parse text
|
45
46
|
end
|
46
47
|
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -108,7 +108,7 @@ module Wp2txt
|
|
108
108
|
process_interwiki_links!(text)
|
109
109
|
process_external_links!(text)
|
110
110
|
unescape_nowiki!(text)
|
111
|
-
|
111
|
+
|
112
112
|
remove_directive!(text)
|
113
113
|
remove_emphasis!(text)
|
114
114
|
mndash!(text)
|
@@ -116,11 +116,15 @@ module Wp2txt
|
|
116
116
|
remove_tag!(text)
|
117
117
|
correct_inline_template!(text) unless $leave_inline_template
|
118
118
|
remove_templates!(text) unless $leave_inline_template
|
119
|
-
|
119
|
+
remove_table!(text) unless $leave_table
|
120
120
|
end
|
121
121
|
|
122
122
|
def cleanup!(text)
|
123
123
|
text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
|
124
|
+
text.gsub!(/^File:.+$/){""}
|
125
|
+
text.gsub!(/^\|.*$/){""}
|
126
|
+
text.gsub!(/^{{.*$/){""}
|
127
|
+
text.gsub!(/^}}.*$/){""}
|
124
128
|
text.gsub!(/\n\n\n+/m){"\n\n"}
|
125
129
|
text.strip!
|
126
130
|
text << "\n\n"
|
@@ -128,45 +132,46 @@ module Wp2txt
|
|
128
132
|
#################### parser for nested structure ####################
|
129
133
|
|
130
134
|
def process_nested_structure(scanner, left, right, &block)
|
135
|
+
test = false
|
131
136
|
buffer = ""
|
132
137
|
begin
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
buffer << str
|
150
|
-
has_left = true
|
151
|
-
when right
|
152
|
-
if has_left
|
153
|
-
buffer = buffer[0...-(left.size)]
|
154
|
-
contents = block.call(str[0...-(left.size)])
|
155
|
-
buffer << contents
|
156
|
-
break
|
157
|
-
else
|
138
|
+
if left == "[" && right == "]"
|
139
|
+
regex = $single_square_bracket_regex
|
140
|
+
elsif left == "[[" && right == "]]"
|
141
|
+
regex = $double_square_bracket_regex
|
142
|
+
elsif left == "{" && right == "}"
|
143
|
+
regex = $single_curly_bracket_regex
|
144
|
+
elsif left == "{{" && right == "}}"
|
145
|
+
regex = $double_curly_bracket_regex
|
146
|
+
elsif left == "{|" && right == "|}"
|
147
|
+
regex = $curly_square_bracket_regex
|
148
|
+
else
|
149
|
+
regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
|
150
|
+
end
|
151
|
+
while str = scanner.scan_until(regex)
|
152
|
+
case scanner[1]
|
153
|
+
when left
|
158
154
|
buffer << str
|
155
|
+
has_left = true
|
156
|
+
when right
|
157
|
+
if has_left
|
158
|
+
buffer = buffer[0...-(left.size)]
|
159
|
+
contents = block.call(str[0...-(left.size)])
|
160
|
+
buffer << contents
|
161
|
+
break
|
162
|
+
else
|
163
|
+
buffer << str
|
164
|
+
end
|
159
165
|
end
|
160
166
|
end
|
161
|
-
|
162
|
-
buffer << scanner.rest
|
167
|
+
buffer << scanner.rest
|
163
168
|
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
169
|
+
if buffer == scanner.string
|
170
|
+
return buffer
|
171
|
+
else
|
172
|
+
scanner.string = buffer
|
173
|
+
return process_nested_structure(scanner, left, right, &block) || ""
|
174
|
+
end
|
170
175
|
rescue => e
|
171
176
|
return scanner.string
|
172
177
|
end
|
@@ -234,6 +239,10 @@ module Wp2txt
|
|
234
239
|
result = process_nested_structure(scanner, "{", "}") do |contents|
|
235
240
|
""
|
236
241
|
end
|
242
|
+
scanner = StringScanner.new(result)
|
243
|
+
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
244
|
+
""
|
245
|
+
end
|
237
246
|
str.replace(result)
|
238
247
|
end
|
239
248
|
|
@@ -299,6 +308,24 @@ module Wp2txt
|
|
299
308
|
def remove_ref!(str)
|
300
309
|
str.gsub!($format_ref_regex){""}
|
301
310
|
end
|
311
|
+
|
312
|
+
def remove_html!(str)
|
313
|
+
["div", "gallery", "timeline"].each do |tag|
|
314
|
+
scanner = StringScanner.new(str)
|
315
|
+
result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
|
316
|
+
""
|
317
|
+
end
|
318
|
+
str.replace(result)
|
319
|
+
end
|
320
|
+
end
|
321
|
+
|
322
|
+
def remove_complex!(str)
|
323
|
+
str.gsub!(/(?:'')?\[https?\:[^\[\]]+?\s([^\]]++)?\](?:'')?/){$1}
|
324
|
+
str.gsub!(/(?:'')?\[https?\:[^\[\]]++\](?:'')?\s?/){""}
|
325
|
+
str.gsub!(/\<\<([^<>]++)\>\>\s?/){"《#{$1}》"}
|
326
|
+
str.gsub!(/\{\{(?:Infobox|efn|Sfn|div col|no col|bar box|formatnum\:|Refnest\||Refnest\||Col\||See also\||R\|)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}/im){""}
|
327
|
+
str.gsub!(/\[\[(?:File|ファイル)\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]/im){""}
|
328
|
+
end
|
302
329
|
|
303
330
|
def make_reference!(str)
|
304
331
|
str.gsub!($make_reference_regex_a){"\n"}
|
@@ -311,30 +338,32 @@ module Wp2txt
|
|
311
338
|
scanner = StringScanner.new(str)
|
312
339
|
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
313
340
|
parts = contents.split("|")
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
341
|
+
if /\A(?:lang|fontsize)\z/i =~ parts[0]
|
342
|
+
parts.shift
|
343
|
+
elsif /\Alang=/i =~ parts[1]
|
344
|
+
parts.shift
|
345
|
+
end
|
346
|
+
|
347
|
+
if parts.size == 1
|
348
|
+
out = parts[0]
|
349
|
+
else
|
350
|
+
begin
|
351
|
+
keyval = parts[1].split("=")
|
352
|
+
if keyval.size > 1
|
353
|
+
out = keyval[1]
|
354
|
+
else
|
355
|
+
out = parts[1] || ""
|
328
356
|
end
|
329
|
-
|
330
|
-
|
331
|
-
|
357
|
+
rescue
|
358
|
+
out = parts[1] || ""
|
359
|
+
end
|
360
|
+
end
|
361
|
+
|
332
362
|
out.strip
|
333
363
|
end
|
334
364
|
str.replace result
|
335
365
|
end
|
336
366
|
|
337
|
-
|
338
367
|
#################### file related utilities ####################
|
339
368
|
|
340
369
|
# collect filenames recursively
|
@@ -427,5 +456,4 @@ module Wp2txt
|
|
427
456
|
str = i.to_s.reverse
|
428
457
|
return str.scan(/.?.?./).join(',').reverse
|
429
458
|
end
|
430
|
-
|
431
459
|
end
|
data/lib/wp2txt/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.3
|
4
|
+
version: 0.9.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -85,8 +85,10 @@ files:
|
|
85
85
|
- bin/wp2txt
|
86
86
|
- data/output_samples/testdata_en.txt
|
87
87
|
- data/output_samples/testdata_en_categories.txt
|
88
|
+
- data/output_samples/testdata_en_summary.txt
|
88
89
|
- data/output_samples/testdata_ja.txt
|
89
90
|
- data/output_samples/testdata_ja_categories.txt
|
91
|
+
- data/output_samples/testdata_ja_summary.txt
|
90
92
|
- data/testdata_en.bz2
|
91
93
|
- data/testdata_ja.bz2
|
92
94
|
- lib/wp2txt.rb
|
@@ -116,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
116
118
|
- !ruby/object:Gem::Version
|
117
119
|
version: '0'
|
118
120
|
requirements: []
|
119
|
-
rubygems_version: 3.3.
|
121
|
+
rubygems_version: 3.3.7
|
120
122
|
signing_key:
|
121
123
|
specification_version: 4
|
122
124
|
summary: Wikipedia dump to text converter
|