wp2txt 0.9.2 → 0.9.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +68 -31
- data/bin/wp2txt +62 -53
- data/data/output_samples/testdata_en.txt +11923 -36921
- data/data/output_samples/testdata_en_categories.txt +132 -0
- data/data/output_samples/testdata_en_summary.txt +1368 -0
- data/data/output_samples/testdata_ja.txt +24812 -4686
- data/data/output_samples/testdata_ja_categories.txt +206 -0
- data/data/output_samples/testdata_ja_summary.txt +1684 -0
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/lib/wp2txt/article.rb +3 -2
- data/lib/wp2txt/utils.rb +51 -27
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +2 -2
- metadata +7 -3
data/data/testdata_en.bz2
CHANGED
Binary file
|
data/data/testdata_ja.bz2
CHANGED
Binary file
|
data/lib/wp2txt/article.rb
CHANGED
@@ -37,10 +37,11 @@ module Wp2txt
|
|
37
37
|
def initialize(text, title = "", strip_tmarker = false)
|
38
38
|
@title = title.strip
|
39
39
|
@strip_tmarker = strip_tmarker
|
40
|
-
convert_characters!(text)
|
40
|
+
convert_characters!(text)
|
41
|
+
remove_html!(text)
|
42
|
+
remove_complex!(text)
|
41
43
|
make_reference!(text)
|
42
44
|
remove_ref!(text)
|
43
|
-
|
44
45
|
parse text
|
45
46
|
end
|
46
47
|
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -108,7 +108,7 @@ module Wp2txt
|
|
108
108
|
process_interwiki_links!(text)
|
109
109
|
process_external_links!(text)
|
110
110
|
unescape_nowiki!(text)
|
111
|
-
|
111
|
+
|
112
112
|
remove_directive!(text)
|
113
113
|
remove_emphasis!(text)
|
114
114
|
mndash!(text)
|
@@ -116,11 +116,15 @@ module Wp2txt
|
|
116
116
|
remove_tag!(text)
|
117
117
|
correct_inline_template!(text) unless $leave_inline_template
|
118
118
|
remove_templates!(text) unless $leave_inline_template
|
119
|
-
|
119
|
+
remove_table!(text) unless $leave_table
|
120
120
|
end
|
121
121
|
|
122
122
|
def cleanup!(text)
|
123
123
|
text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
|
124
|
+
text.gsub!(/^File:.+$/){""}
|
125
|
+
text.gsub!(/^\|.*$/){""}
|
126
|
+
text.gsub!(/^{{.*$/){""}
|
127
|
+
text.gsub!(/^}}.*$/){""}
|
124
128
|
text.gsub!(/\n\n\n+/m){"\n\n"}
|
125
129
|
text.strip!
|
126
130
|
text << "\n\n"
|
@@ -128,8 +132,9 @@ module Wp2txt
|
|
128
132
|
#################### parser for nested structure ####################
|
129
133
|
|
130
134
|
def process_nested_structure(scanner, left, right, &block)
|
135
|
+
test = false
|
131
136
|
buffer = ""
|
132
|
-
begin
|
137
|
+
# begin
|
133
138
|
if left == "[" && right == "]"
|
134
139
|
regex = $single_square_bracket_regex
|
135
140
|
elsif left == "[[" && right == "]]"
|
@@ -141,7 +146,7 @@ module Wp2txt
|
|
141
146
|
elsif left == "{|" && right == "|}"
|
142
147
|
regex = $curly_square_bracket_regex
|
143
148
|
else
|
144
|
-
regex = Regexp.new(
|
149
|
+
regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
|
145
150
|
end
|
146
151
|
while str = scanner.scan_until(regex)
|
147
152
|
case scanner[1]
|
@@ -167,9 +172,9 @@ module Wp2txt
|
|
167
172
|
scanner.string = buffer
|
168
173
|
return process_nested_structure(scanner, left, right, &block) || ""
|
169
174
|
end
|
170
|
-
rescue => e
|
171
|
-
|
172
|
-
end
|
175
|
+
# rescue => e
|
176
|
+
# return scanner.string
|
177
|
+
# end
|
173
178
|
end
|
174
179
|
|
175
180
|
#################### methods used from format_wiki ####################
|
@@ -234,6 +239,10 @@ module Wp2txt
|
|
234
239
|
result = process_nested_structure(scanner, "{", "}") do |contents|
|
235
240
|
""
|
236
241
|
end
|
242
|
+
scanner = StringScanner.new(result)
|
243
|
+
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
244
|
+
""
|
245
|
+
end
|
237
246
|
str.replace(result)
|
238
247
|
end
|
239
248
|
|
@@ -299,6 +308,24 @@ module Wp2txt
|
|
299
308
|
def remove_ref!(str)
|
300
309
|
str.gsub!($format_ref_regex){""}
|
301
310
|
end
|
311
|
+
|
312
|
+
def remove_html!(str)
|
313
|
+
["div", "gallery", "timeline"].each do |tag|
|
314
|
+
scanner = StringScanner.new(str)
|
315
|
+
result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
|
316
|
+
""
|
317
|
+
end
|
318
|
+
str.replace(result)
|
319
|
+
end
|
320
|
+
end
|
321
|
+
|
322
|
+
def remove_complex!(str)
|
323
|
+
str.gsub!(/(?:'')?\[https?\:[^\[\]]+?\s([^\]]++)?\](?:'')?/){$1}
|
324
|
+
str.gsub!(/(?:'')?\[https?\:[^\[\]]++\](?:'')?\s?/){""}
|
325
|
+
str.gsub!(/\<\<([^<>]++)\>\>\s?/){"《#{$1}》"}
|
326
|
+
str.gsub!(/\{\{(?:Infobox|efn|Sfn|div col|no col|bar box|formatnum\:|Refnest\||Refnest\||Col\||See also\||R\|)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}/im){""}
|
327
|
+
str.gsub!(/\[\[(?:File|ファイル)\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]/im){""}
|
328
|
+
end
|
302
329
|
|
303
330
|
def make_reference!(str)
|
304
331
|
str.gsub!($make_reference_regex_a){"\n"}
|
@@ -311,30 +338,28 @@ module Wp2txt
|
|
311
338
|
scanner = StringScanner.new(str)
|
312
339
|
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
313
340
|
parts = contents.split("|")
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
# end
|
341
|
+
if /\A(?:lang|fontsize)\z/i =~ parts[0]
|
342
|
+
parts.shift
|
343
|
+
elsif /\Alang=/i =~ parts[1]
|
344
|
+
parts.shift
|
345
|
+
end
|
346
|
+
|
347
|
+
if parts.size == 1
|
348
|
+
out = parts[0]
|
349
|
+
else
|
350
|
+
keyval = parts[1].split("=")
|
351
|
+
if keyval.size > 1
|
352
|
+
out = keyval[1]
|
353
|
+
else
|
354
|
+
out = parts[1] || ""
|
355
|
+
end
|
356
|
+
end
|
357
|
+
|
332
358
|
out.strip
|
333
359
|
end
|
334
360
|
str.replace result
|
335
361
|
end
|
336
362
|
|
337
|
-
|
338
363
|
#################### file related utilities ####################
|
339
364
|
|
340
365
|
# collect filenames recursively
|
@@ -427,5 +452,4 @@ module Wp2txt
|
|
427
452
|
str = i.to_s.reverse
|
428
453
|
return str.scan(/.?.?./).join(',').reverse
|
429
454
|
end
|
430
|
-
|
431
455
|
end
|
data/lib/wp2txt/version.rb
CHANGED
data/lib/wp2txt.rb
CHANGED
@@ -101,7 +101,7 @@ module Wp2txt
|
|
101
101
|
if /.bz2$/ =~ @input_file
|
102
102
|
unless NO_BZ2
|
103
103
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
104
|
-
@parent.msg("WP2TXT is
|
104
|
+
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
105
105
|
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
106
106
|
@infile_size = file_size(file)
|
107
107
|
@parent.msg("... Done.", 1)
|
@@ -113,7 +113,7 @@ module Wp2txt
|
|
113
113
|
else
|
114
114
|
file = IO.popen("bzip2 -c -d #{@input_file}")
|
115
115
|
end
|
116
|
-
@parent.msg("WP2TXT is
|
116
|
+
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
117
117
|
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
118
118
|
@infile_size = file_size(file)
|
119
119
|
@parent.msg("... Done.", 1)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.2
|
4
|
+
version: 0.9.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -84,7 +84,11 @@ files:
|
|
84
84
|
- bin/benchmark.rb
|
85
85
|
- bin/wp2txt
|
86
86
|
- data/output_samples/testdata_en.txt
|
87
|
+
- data/output_samples/testdata_en_categories.txt
|
88
|
+
- data/output_samples/testdata_en_summary.txt
|
87
89
|
- data/output_samples/testdata_ja.txt
|
90
|
+
- data/output_samples/testdata_ja_categories.txt
|
91
|
+
- data/output_samples/testdata_ja_summary.txt
|
88
92
|
- data/testdata_en.bz2
|
89
93
|
- data/testdata_ja.bz2
|
90
94
|
- lib/wp2txt.rb
|
@@ -114,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
114
118
|
- !ruby/object:Gem::Version
|
115
119
|
version: '0'
|
116
120
|
requirements: []
|
117
|
-
rubygems_version: 3.3.
|
121
|
+
rubygems_version: 3.3.7
|
118
122
|
signing_key:
|
119
123
|
specification_version: 4
|
120
124
|
summary: Wikipedia dump to text converter
|