wp2txt 0.9.3 → 0.9.5.1

Sign up to get free protection for your applications and to get access to all the features.
data/data/testdata_en.bz2 CHANGED
Binary file
data/data/testdata_ja.bz2 CHANGED
Binary file
@@ -37,10 +37,11 @@ module Wp2txt
37
37
  def initialize(text, title = "", strip_tmarker = false)
38
38
  @title = title.strip
39
39
  @strip_tmarker = strip_tmarker
40
- convert_characters!(text)
40
+ convert_characters!(text)
41
+ remove_html!(text)
42
+ remove_complex!(text)
41
43
  make_reference!(text)
42
44
  remove_ref!(text)
43
-
44
45
  parse text
45
46
  end
46
47
 
data/lib/wp2txt/utils.rb CHANGED
@@ -108,7 +108,7 @@ module Wp2txt
108
108
  process_interwiki_links!(text)
109
109
  process_external_links!(text)
110
110
  unescape_nowiki!(text)
111
- #####
111
+
112
112
  remove_directive!(text)
113
113
  remove_emphasis!(text)
114
114
  mndash!(text)
@@ -116,11 +116,15 @@ module Wp2txt
116
116
  remove_tag!(text)
117
117
  correct_inline_template!(text) unless $leave_inline_template
118
118
  remove_templates!(text) unless $leave_inline_template
119
- # remove_table!(text) unless $leave_table
119
+ remove_table!(text) unless $leave_table
120
120
  end
121
121
 
122
122
  def cleanup!(text)
123
123
  text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
124
+ text.gsub!(/^File:.+$/){""}
125
+ text.gsub!(/^\|.*$/){""}
126
+ text.gsub!(/^{{.*$/){""}
127
+ text.gsub!(/^}}.*$/){""}
124
128
  text.gsub!(/\n\n\n+/m){"\n\n"}
125
129
  text.strip!
126
130
  text << "\n\n"
@@ -128,45 +132,46 @@ module Wp2txt
128
132
  #################### parser for nested structure ####################
129
133
 
130
134
  def process_nested_structure(scanner, left, right, &block)
135
+ test = false
131
136
  buffer = ""
132
137
  begin
133
- if left == "[" && right == "]"
134
- regex = $single_square_bracket_regex
135
- elsif left == "[[" && right == "]]"
136
- regex = $double_square_bracket_regex
137
- elsif left == "{" && right == "}"
138
- regex = $single_curly_bracket_regex
139
- elsif left == "{{" && right == "}}"
140
- regex = $double_curly_bracket_regex
141
- elsif left == "{|" && right == "|}"
142
- regex = $curly_square_bracket_regex
143
- else
144
- regex = Regexp.new('(#{Regexp.escape(left)}|#{Regexp.escape(right)})', Regexp::MULTILINE)
145
- end
146
- while str = scanner.scan_until(regex)
147
- case scanner[1]
148
- when left
149
- buffer << str
150
- has_left = true
151
- when right
152
- if has_left
153
- buffer = buffer[0...-(left.size)]
154
- contents = block.call(str[0...-(left.size)])
155
- buffer << contents
156
- break
157
- else
138
+ if left == "[" && right == "]"
139
+ regex = $single_square_bracket_regex
140
+ elsif left == "[[" && right == "]]"
141
+ regex = $double_square_bracket_regex
142
+ elsif left == "{" && right == "}"
143
+ regex = $single_curly_bracket_regex
144
+ elsif left == "{{" && right == "}}"
145
+ regex = $double_curly_bracket_regex
146
+ elsif left == "{|" && right == "|}"
147
+ regex = $curly_square_bracket_regex
148
+ else
149
+ regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
150
+ end
151
+ while str = scanner.scan_until(regex)
152
+ case scanner[1]
153
+ when left
158
154
  buffer << str
155
+ has_left = true
156
+ when right
157
+ if has_left
158
+ buffer = buffer[0...-(left.size)]
159
+ contents = block.call(str[0...-(left.size)])
160
+ buffer << contents
161
+ break
162
+ else
163
+ buffer << str
164
+ end
159
165
  end
160
166
  end
161
- end
162
- buffer << scanner.rest
167
+ buffer << scanner.rest
163
168
 
164
- if buffer == scanner.string
165
- return buffer
166
- else
167
- scanner.string = buffer
168
- return process_nested_structure(scanner, left, right, &block) || ""
169
- end
169
+ if buffer == scanner.string
170
+ return buffer
171
+ else
172
+ scanner.string = buffer
173
+ return process_nested_structure(scanner, left, right, &block) || ""
174
+ end
170
175
  rescue => e
171
176
  return scanner.string
172
177
  end
@@ -234,6 +239,10 @@ module Wp2txt
234
239
  result = process_nested_structure(scanner, "{", "}") do |contents|
235
240
  ""
236
241
  end
242
+ scanner = StringScanner.new(result)
243
+ result = process_nested_structure(scanner, "{{", "}}") do |contents|
244
+ ""
245
+ end
237
246
  str.replace(result)
238
247
  end
239
248
 
@@ -299,6 +308,24 @@ module Wp2txt
299
308
  def remove_ref!(str)
300
309
  str.gsub!($format_ref_regex){""}
301
310
  end
311
+
312
+ def remove_html!(str)
313
+ ["div", "gallery", "timeline"].each do |tag|
314
+ scanner = StringScanner.new(str)
315
+ result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
316
+ ""
317
+ end
318
+ str.replace(result)
319
+ end
320
+ end
321
+
322
+ def remove_complex!(str)
323
+ str.gsub!(/(?:'')?\[https?\:[^\[\]]+?\s([^\]]++)?\](?:'')?/){$1}
324
+ str.gsub!(/(?:'')?\[https?\:[^\[\]]++\](?:'')?\s?/){""}
325
+ str.gsub!(/\<\<([^<>]++)\>\>\s?/){"《#{$1}》"}
326
+ str.gsub!(/\{\{(?:Infobox|efn|Sfn|div col|no col|bar box|formatnum\:|Refnest\||Refnest\||Col\||See also\||R\|)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}/im){""}
327
+ str.gsub!(/\[\[(?:File|ファイル)\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]/im){""}
328
+ end
302
329
 
303
330
  def make_reference!(str)
304
331
  str.gsub!($make_reference_regex_a){"\n"}
@@ -311,30 +338,32 @@ module Wp2txt
311
338
  scanner = StringScanner.new(str)
312
339
  result = process_nested_structure(scanner, "{{", "}}") do |contents|
313
340
  parts = contents.split("|")
314
- # type_code = parts.first
315
- # case type_code
316
- # when $type_code_regex
317
- # out = parts[-1]
318
- # else
319
- # case parts.size
320
- # when 0
321
- # out = ""
322
- # when 1
323
- # out = parts.first || ""
324
- # else
325
- # while parts.size > 2 && parts.last.split("=").size > 1
326
- while parts.size > 1 && parts.last.split("=").size > 1
327
- parts.pop
341
+ if /\A(?:lang|fontsize)\z/i =~ parts[0]
342
+ parts.shift
343
+ elsif /\Alang=/i =~ parts[1]
344
+ parts.shift
345
+ end
346
+
347
+ if parts.size == 1
348
+ out = parts[0]
349
+ else
350
+ begin
351
+ keyval = parts[1].split("=")
352
+ if keyval.size > 1
353
+ out = keyval[1]
354
+ else
355
+ out = parts[1] || ""
328
356
  end
329
- out = parts.last || ""
330
- # end
331
- # end
357
+ rescue
358
+ out = parts[1] || ""
359
+ end
360
+ end
361
+
332
362
  out.strip
333
363
  end
334
364
  str.replace result
335
365
  end
336
366
 
337
-
338
367
  #################### file related utilities ####################
339
368
 
340
369
  # collect filenames recursively
@@ -427,5 +456,4 @@ module Wp2txt
427
456
  str = i.to_s.reverse
428
457
  return str.scan(/.?.?./).join(',').reverse
429
458
  end
430
-
431
459
  end
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.9.3"
2
+ VERSION = "0.9.5.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.3
4
+ version: 0.9.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-07-29 00:00:00.000000000 Z
11
+ date: 2022-08-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -85,8 +85,10 @@ files:
85
85
  - bin/wp2txt
86
86
  - data/output_samples/testdata_en.txt
87
87
  - data/output_samples/testdata_en_categories.txt
88
+ - data/output_samples/testdata_en_summary.txt
88
89
  - data/output_samples/testdata_ja.txt
89
90
  - data/output_samples/testdata_ja_categories.txt
91
+ - data/output_samples/testdata_ja_summary.txt
90
92
  - data/testdata_en.bz2
91
93
  - data/testdata_ja.bz2
92
94
  - lib/wp2txt.rb
@@ -116,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
116
118
  - !ruby/object:Gem::Version
117
119
  version: '0'
118
120
  requirements: []
119
- rubygems_version: 3.3.3
121
+ rubygems_version: 3.3.7
120
122
  signing_key:
121
123
  specification_version: 4
122
124
  summary: Wikipedia dump to text converter