wp2txt 0.9.4 → 0.9.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +34 -17
- data/bin/wp2txt +7 -6
- data/data/output_samples/testdata_en.txt +11923 -36921
- data/data/output_samples/testdata_en_categories.txt +107 -182
- data/data/output_samples/testdata_en_summary.txt +1368 -0
- data/data/output_samples/testdata_ja.txt +24812 -4686
- data/data/output_samples/testdata_ja_categories.txt +202 -44
- data/data/output_samples/testdata_ja_summary.txt +1684 -0
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/lib/wp2txt/article.rb +3 -2
- data/lib/wp2txt/utils.rb +51 -27
- data/lib/wp2txt/version.rb +1 -1
- metadata +4 -2
data/data/testdata_en.bz2
CHANGED
Binary file
|
data/data/testdata_ja.bz2
CHANGED
Binary file
|
data/lib/wp2txt/article.rb
CHANGED
@@ -37,10 +37,11 @@ module Wp2txt
|
|
37
37
|
def initialize(text, title = "", strip_tmarker = false)
|
38
38
|
@title = title.strip
|
39
39
|
@strip_tmarker = strip_tmarker
|
40
|
-
convert_characters!(text)
|
40
|
+
convert_characters!(text)
|
41
|
+
remove_html!(text)
|
42
|
+
remove_complex!(text)
|
41
43
|
make_reference!(text)
|
42
44
|
remove_ref!(text)
|
43
|
-
|
44
45
|
parse text
|
45
46
|
end
|
46
47
|
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -108,7 +108,7 @@ module Wp2txt
|
|
108
108
|
process_interwiki_links!(text)
|
109
109
|
process_external_links!(text)
|
110
110
|
unescape_nowiki!(text)
|
111
|
-
|
111
|
+
|
112
112
|
remove_directive!(text)
|
113
113
|
remove_emphasis!(text)
|
114
114
|
mndash!(text)
|
@@ -116,11 +116,15 @@ module Wp2txt
|
|
116
116
|
remove_tag!(text)
|
117
117
|
correct_inline_template!(text) unless $leave_inline_template
|
118
118
|
remove_templates!(text) unless $leave_inline_template
|
119
|
-
|
119
|
+
remove_table!(text) unless $leave_table
|
120
120
|
end
|
121
121
|
|
122
122
|
def cleanup!(text)
|
123
123
|
text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
|
124
|
+
text.gsub!(/^File:.+$/){""}
|
125
|
+
text.gsub!(/^\|.*$/){""}
|
126
|
+
text.gsub!(/^{{.*$/){""}
|
127
|
+
text.gsub!(/^}}.*$/){""}
|
124
128
|
text.gsub!(/\n\n\n+/m){"\n\n"}
|
125
129
|
text.strip!
|
126
130
|
text << "\n\n"
|
@@ -128,8 +132,9 @@ module Wp2txt
|
|
128
132
|
#################### parser for nested structure ####################
|
129
133
|
|
130
134
|
def process_nested_structure(scanner, left, right, &block)
|
135
|
+
test = false
|
131
136
|
buffer = ""
|
132
|
-
begin
|
137
|
+
# begin
|
133
138
|
if left == "[" && right == "]"
|
134
139
|
regex = $single_square_bracket_regex
|
135
140
|
elsif left == "[[" && right == "]]"
|
@@ -141,7 +146,7 @@ module Wp2txt
|
|
141
146
|
elsif left == "{|" && right == "|}"
|
142
147
|
regex = $curly_square_bracket_regex
|
143
148
|
else
|
144
|
-
regex = Regexp.new(
|
149
|
+
regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
|
145
150
|
end
|
146
151
|
while str = scanner.scan_until(regex)
|
147
152
|
case scanner[1]
|
@@ -167,9 +172,9 @@ module Wp2txt
|
|
167
172
|
scanner.string = buffer
|
168
173
|
return process_nested_structure(scanner, left, right, &block) || ""
|
169
174
|
end
|
170
|
-
rescue => e
|
171
|
-
|
172
|
-
end
|
175
|
+
# rescue => e
|
176
|
+
# return scanner.string
|
177
|
+
# end
|
173
178
|
end
|
174
179
|
|
175
180
|
#################### methods used from format_wiki ####################
|
@@ -234,6 +239,10 @@ module Wp2txt
|
|
234
239
|
result = process_nested_structure(scanner, "{", "}") do |contents|
|
235
240
|
""
|
236
241
|
end
|
242
|
+
scanner = StringScanner.new(result)
|
243
|
+
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
244
|
+
""
|
245
|
+
end
|
237
246
|
str.replace(result)
|
238
247
|
end
|
239
248
|
|
@@ -299,6 +308,24 @@ module Wp2txt
|
|
299
308
|
def remove_ref!(str)
|
300
309
|
str.gsub!($format_ref_regex){""}
|
301
310
|
end
|
311
|
+
|
312
|
+
def remove_html!(str)
|
313
|
+
["div", "gallery", "timeline"].each do |tag|
|
314
|
+
scanner = StringScanner.new(str)
|
315
|
+
result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
|
316
|
+
""
|
317
|
+
end
|
318
|
+
str.replace(result)
|
319
|
+
end
|
320
|
+
end
|
321
|
+
|
322
|
+
def remove_complex!(str)
|
323
|
+
str.gsub!(/(?:'')?\[https?\:[^\[\]]+?\s([^\]]++)?\](?:'')?/){$1}
|
324
|
+
str.gsub!(/(?:'')?\[https?\:[^\[\]]++\](?:'')?\s?/){""}
|
325
|
+
str.gsub!(/\<\<([^<>]++)\>\>\s?/){"《#{$1}》"}
|
326
|
+
str.gsub!(/\{\{(?:Infobox|efn|Sfn|div col|no col|bar box|formatnum\:|Refnest\||Refnest\||Col\||See also\||R\|)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}/im){""}
|
327
|
+
str.gsub!(/\[\[(?:File|ファイル)\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]/im){""}
|
328
|
+
end
|
302
329
|
|
303
330
|
def make_reference!(str)
|
304
331
|
str.gsub!($make_reference_regex_a){"\n"}
|
@@ -311,30 +338,28 @@ module Wp2txt
|
|
311
338
|
scanner = StringScanner.new(str)
|
312
339
|
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
313
340
|
parts = contents.split("|")
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
# end
|
341
|
+
if /\A(?:lang|fontsize)\z/i =~ parts[0]
|
342
|
+
parts.shift
|
343
|
+
elsif /\Alang=/i =~ parts[1]
|
344
|
+
parts.shift
|
345
|
+
end
|
346
|
+
|
347
|
+
if parts.size == 1
|
348
|
+
out = parts[0]
|
349
|
+
else
|
350
|
+
keyval = parts[1].split("=")
|
351
|
+
if keyval.size > 1
|
352
|
+
out = keyval[1]
|
353
|
+
else
|
354
|
+
out = parts[1] || ""
|
355
|
+
end
|
356
|
+
end
|
357
|
+
|
332
358
|
out.strip
|
333
359
|
end
|
334
360
|
str.replace result
|
335
361
|
end
|
336
362
|
|
337
|
-
|
338
363
|
#################### file related utilities ####################
|
339
364
|
|
340
365
|
# collect filenames recursively
|
@@ -427,5 +452,4 @@ module Wp2txt
|
|
427
452
|
str = i.to_s.reverse
|
428
453
|
return str.scan(/.?.?./).join(',').reverse
|
429
454
|
end
|
430
|
-
|
431
455
|
end
|
data/lib/wp2txt/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.4
|
4
|
+
version: 0.9.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -85,8 +85,10 @@ files:
|
|
85
85
|
- bin/wp2txt
|
86
86
|
- data/output_samples/testdata_en.txt
|
87
87
|
- data/output_samples/testdata_en_categories.txt
|
88
|
+
- data/output_samples/testdata_en_summary.txt
|
88
89
|
- data/output_samples/testdata_ja.txt
|
89
90
|
- data/output_samples/testdata_ja_categories.txt
|
91
|
+
- data/output_samples/testdata_ja_summary.txt
|
90
92
|
- data/testdata_en.bz2
|
91
93
|
- data/testdata_ja.bz2
|
92
94
|
- lib/wp2txt.rb
|