wp2txt 0.9.2 → 0.9.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +68 -31
- data/bin/wp2txt +62 -53
- data/data/output_samples/testdata_en.txt +11923 -36921
- data/data/output_samples/testdata_en_categories.txt +132 -0
- data/data/output_samples/testdata_en_summary.txt +1368 -0
- data/data/output_samples/testdata_ja.txt +24812 -4686
- data/data/output_samples/testdata_ja_categories.txt +206 -0
- data/data/output_samples/testdata_ja_summary.txt +1684 -0
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/lib/wp2txt/article.rb +3 -2
- data/lib/wp2txt/utils.rb +51 -27
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +2 -2
- metadata +7 -3
data/data/testdata_en.bz2
CHANGED
Binary file
|
data/data/testdata_ja.bz2
CHANGED
Binary file
|
data/lib/wp2txt/article.rb
CHANGED
@@ -37,10 +37,11 @@ module Wp2txt
|
|
37
37
|
def initialize(text, title = "", strip_tmarker = false)
|
38
38
|
@title = title.strip
|
39
39
|
@strip_tmarker = strip_tmarker
|
40
|
-
convert_characters!(text)
|
40
|
+
convert_characters!(text)
|
41
|
+
remove_html!(text)
|
42
|
+
remove_complex!(text)
|
41
43
|
make_reference!(text)
|
42
44
|
remove_ref!(text)
|
43
|
-
|
44
45
|
parse text
|
45
46
|
end
|
46
47
|
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -108,7 +108,7 @@ module Wp2txt
|
|
108
108
|
process_interwiki_links!(text)
|
109
109
|
process_external_links!(text)
|
110
110
|
unescape_nowiki!(text)
|
111
|
-
|
111
|
+
|
112
112
|
remove_directive!(text)
|
113
113
|
remove_emphasis!(text)
|
114
114
|
mndash!(text)
|
@@ -116,11 +116,15 @@ module Wp2txt
|
|
116
116
|
remove_tag!(text)
|
117
117
|
correct_inline_template!(text) unless $leave_inline_template
|
118
118
|
remove_templates!(text) unless $leave_inline_template
|
119
|
-
|
119
|
+
remove_table!(text) unless $leave_table
|
120
120
|
end
|
121
121
|
|
122
122
|
def cleanup!(text)
|
123
123
|
text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
|
124
|
+
text.gsub!(/^File:.+$/){""}
|
125
|
+
text.gsub!(/^\|.*$/){""}
|
126
|
+
text.gsub!(/^{{.*$/){""}
|
127
|
+
text.gsub!(/^}}.*$/){""}
|
124
128
|
text.gsub!(/\n\n\n+/m){"\n\n"}
|
125
129
|
text.strip!
|
126
130
|
text << "\n\n"
|
@@ -128,8 +132,9 @@ module Wp2txt
|
|
128
132
|
#################### parser for nested structure ####################
|
129
133
|
|
130
134
|
def process_nested_structure(scanner, left, right, &block)
|
135
|
+
test = false
|
131
136
|
buffer = ""
|
132
|
-
begin
|
137
|
+
# begin
|
133
138
|
if left == "[" && right == "]"
|
134
139
|
regex = $single_square_bracket_regex
|
135
140
|
elsif left == "[[" && right == "]]"
|
@@ -141,7 +146,7 @@ module Wp2txt
|
|
141
146
|
elsif left == "{|" && right == "|}"
|
142
147
|
regex = $curly_square_bracket_regex
|
143
148
|
else
|
144
|
-
regex = Regexp.new(
|
149
|
+
regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
|
145
150
|
end
|
146
151
|
while str = scanner.scan_until(regex)
|
147
152
|
case scanner[1]
|
@@ -167,9 +172,9 @@ module Wp2txt
|
|
167
172
|
scanner.string = buffer
|
168
173
|
return process_nested_structure(scanner, left, right, &block) || ""
|
169
174
|
end
|
170
|
-
rescue => e
|
171
|
-
|
172
|
-
end
|
175
|
+
# rescue => e
|
176
|
+
# return scanner.string
|
177
|
+
# end
|
173
178
|
end
|
174
179
|
|
175
180
|
#################### methods used from format_wiki ####################
|
@@ -234,6 +239,10 @@ module Wp2txt
|
|
234
239
|
result = process_nested_structure(scanner, "{", "}") do |contents|
|
235
240
|
""
|
236
241
|
end
|
242
|
+
scanner = StringScanner.new(result)
|
243
|
+
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
244
|
+
""
|
245
|
+
end
|
237
246
|
str.replace(result)
|
238
247
|
end
|
239
248
|
|
@@ -299,6 +308,24 @@ module Wp2txt
|
|
299
308
|
def remove_ref!(str)
|
300
309
|
str.gsub!($format_ref_regex){""}
|
301
310
|
end
|
311
|
+
|
312
|
+
def remove_html!(str)
|
313
|
+
["div", "gallery", "timeline"].each do |tag|
|
314
|
+
scanner = StringScanner.new(str)
|
315
|
+
result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
|
316
|
+
""
|
317
|
+
end
|
318
|
+
str.replace(result)
|
319
|
+
end
|
320
|
+
end
|
321
|
+
|
322
|
+
def remove_complex!(str)
|
323
|
+
str.gsub!(/(?:'')?\[https?\:[^\[\]]+?\s([^\]]++)?\](?:'')?/){$1}
|
324
|
+
str.gsub!(/(?:'')?\[https?\:[^\[\]]++\](?:'')?\s?/){""}
|
325
|
+
str.gsub!(/\<\<([^<>]++)\>\>\s?/){"《#{$1}》"}
|
326
|
+
str.gsub!(/\{\{(?:Infobox|efn|Sfn|div col|no col|bar box|formatnum\:|Refnest\||Refnest\||Col\||See also\||R\|)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}/im){""}
|
327
|
+
str.gsub!(/\[\[(?:File|ファイル)\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]/im){""}
|
328
|
+
end
|
302
329
|
|
303
330
|
def make_reference!(str)
|
304
331
|
str.gsub!($make_reference_regex_a){"\n"}
|
@@ -311,30 +338,28 @@ module Wp2txt
|
|
311
338
|
scanner = StringScanner.new(str)
|
312
339
|
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
313
340
|
parts = contents.split("|")
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
# end
|
341
|
+
if /\A(?:lang|fontsize)\z/i =~ parts[0]
|
342
|
+
parts.shift
|
343
|
+
elsif /\Alang=/i =~ parts[1]
|
344
|
+
parts.shift
|
345
|
+
end
|
346
|
+
|
347
|
+
if parts.size == 1
|
348
|
+
out = parts[0]
|
349
|
+
else
|
350
|
+
keyval = parts[1].split("=")
|
351
|
+
if keyval.size > 1
|
352
|
+
out = keyval[1]
|
353
|
+
else
|
354
|
+
out = parts[1] || ""
|
355
|
+
end
|
356
|
+
end
|
357
|
+
|
332
358
|
out.strip
|
333
359
|
end
|
334
360
|
str.replace result
|
335
361
|
end
|
336
362
|
|
337
|
-
|
338
363
|
#################### file related utilities ####################
|
339
364
|
|
340
365
|
# collect filenames recursively
|
@@ -427,5 +452,4 @@ module Wp2txt
|
|
427
452
|
str = i.to_s.reverse
|
428
453
|
return str.scan(/.?.?./).join(',').reverse
|
429
454
|
end
|
430
|
-
|
431
455
|
end
|
data/lib/wp2txt/version.rb
CHANGED
data/lib/wp2txt.rb
CHANGED
@@ -101,7 +101,7 @@ module Wp2txt
|
|
101
101
|
if /.bz2$/ =~ @input_file
|
102
102
|
unless NO_BZ2
|
103
103
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
104
|
-
@parent.msg("WP2TXT is
|
104
|
+
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
105
105
|
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
106
106
|
@infile_size = file_size(file)
|
107
107
|
@parent.msg("... Done.", 1)
|
@@ -113,7 +113,7 @@ module Wp2txt
|
|
113
113
|
else
|
114
114
|
file = IO.popen("bzip2 -c -d #{@input_file}")
|
115
115
|
end
|
116
|
-
@parent.msg("WP2TXT is
|
116
|
+
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
117
117
|
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
118
118
|
@infile_size = file_size(file)
|
119
119
|
@parent.msg("... Done.", 1)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.2
|
4
|
+
version: 0.9.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -84,7 +84,11 @@ files:
|
|
84
84
|
- bin/benchmark.rb
|
85
85
|
- bin/wp2txt
|
86
86
|
- data/output_samples/testdata_en.txt
|
87
|
+
- data/output_samples/testdata_en_categories.txt
|
88
|
+
- data/output_samples/testdata_en_summary.txt
|
87
89
|
- data/output_samples/testdata_ja.txt
|
90
|
+
- data/output_samples/testdata_ja_categories.txt
|
91
|
+
- data/output_samples/testdata_ja_summary.txt
|
88
92
|
- data/testdata_en.bz2
|
89
93
|
- data/testdata_ja.bz2
|
90
94
|
- lib/wp2txt.rb
|
@@ -114,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
114
118
|
- !ruby/object:Gem::Version
|
115
119
|
version: '0'
|
116
120
|
requirements: []
|
117
|
-
rubygems_version: 3.3.
|
121
|
+
rubygems_version: 3.3.7
|
118
122
|
signing_key:
|
119
123
|
specification_version: 4
|
120
124
|
summary: Wikipedia dump to text converter
|