wp2txt 0.9.4 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/data/testdata_en.bz2 CHANGED
Binary file
data/data/testdata_ja.bz2 CHANGED
Binary file
@@ -37,10 +37,11 @@ module Wp2txt
37
37
  def initialize(text, title = "", strip_tmarker = false)
38
38
  @title = title.strip
39
39
  @strip_tmarker = strip_tmarker
40
- convert_characters!(text)
40
+ convert_characters!(text)
41
+ remove_html!(text)
42
+ remove_complex!(text)
41
43
  make_reference!(text)
42
44
  remove_ref!(text)
43
-
44
45
  parse text
45
46
  end
46
47
 
data/lib/wp2txt/utils.rb CHANGED
@@ -108,7 +108,7 @@ module Wp2txt
108
108
  process_interwiki_links!(text)
109
109
  process_external_links!(text)
110
110
  unescape_nowiki!(text)
111
- #####
111
+
112
112
  remove_directive!(text)
113
113
  remove_emphasis!(text)
114
114
  mndash!(text)
@@ -116,11 +116,15 @@ module Wp2txt
116
116
  remove_tag!(text)
117
117
  correct_inline_template!(text) unless $leave_inline_template
118
118
  remove_templates!(text) unless $leave_inline_template
119
- # remove_table!(text) unless $leave_table
119
+ remove_table!(text) unless $leave_table
120
120
  end
121
121
 
122
122
  def cleanup!(text)
123
123
  text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
124
+ text.gsub!(/^File:.+$/){""}
125
+ text.gsub!(/^\|.*$/){""}
126
+ text.gsub!(/^{{.*$/){""}
127
+ text.gsub!(/^}}.*$/){""}
124
128
  text.gsub!(/\n\n\n+/m){"\n\n"}
125
129
  text.strip!
126
130
  text << "\n\n"
@@ -128,8 +132,9 @@ module Wp2txt
128
132
  #################### parser for nested structure ####################
129
133
 
130
134
  def process_nested_structure(scanner, left, right, &block)
135
+ test = false
131
136
  buffer = ""
132
- begin
137
+ # begin
133
138
  if left == "[" && right == "]"
134
139
  regex = $single_square_bracket_regex
135
140
  elsif left == "[[" && right == "]]"
@@ -141,7 +146,7 @@ module Wp2txt
141
146
  elsif left == "{|" && right == "|}"
142
147
  regex = $curly_square_bracket_regex
143
148
  else
144
- regex = Regexp.new('(#{Regexp.escape(left)}|#{Regexp.escape(right)})', Regexp::MULTILINE)
149
+ regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
145
150
  end
146
151
  while str = scanner.scan_until(regex)
147
152
  case scanner[1]
@@ -167,9 +172,9 @@ module Wp2txt
167
172
  scanner.string = buffer
168
173
  return process_nested_structure(scanner, left, right, &block) || ""
169
174
  end
170
- rescue => e
171
- return scanner.string
172
- end
175
+ # rescue => e
176
+ # return scanner.string
177
+ # end
173
178
  end
174
179
 
175
180
  #################### methods used from format_wiki ####################
@@ -234,6 +239,10 @@ module Wp2txt
234
239
  result = process_nested_structure(scanner, "{", "}") do |contents|
235
240
  ""
236
241
  end
242
+ scanner = StringScanner.new(result)
243
+ result = process_nested_structure(scanner, "{{", "}}") do |contents|
244
+ ""
245
+ end
237
246
  str.replace(result)
238
247
  end
239
248
 
@@ -299,6 +308,24 @@ module Wp2txt
299
308
  def remove_ref!(str)
300
309
  str.gsub!($format_ref_regex){""}
301
310
  end
311
+
312
+ def remove_html!(str)
313
+ ["div", "gallery", "timeline"].each do |tag|
314
+ scanner = StringScanner.new(str)
315
+ result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
316
+ ""
317
+ end
318
+ str.replace(result)
319
+ end
320
+ end
321
+
322
+ def remove_complex!(str)
323
+ str.gsub!(/(?:'')?\[https?\:[^\[\]]+?\s([^\]]++)?\](?:'')?/){$1}
324
+ str.gsub!(/(?:'')?\[https?\:[^\[\]]++\](?:'')?\s?/){""}
325
+ str.gsub!(/\<\<([^<>]++)\>\>\s?/){"《#{$1}》"}
326
+ str.gsub!(/\{\{(?:Infobox|efn|Sfn|div col|no col|bar box|formatnum\:|Refnest\||Refnest\||Col\||See also\||R\|)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}/im){""}
327
+ str.gsub!(/\[\[(?:File|ファイル)\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]/im){""}
328
+ end
302
329
 
303
330
  def make_reference!(str)
304
331
  str.gsub!($make_reference_regex_a){"\n"}
@@ -311,30 +338,28 @@ module Wp2txt
311
338
  scanner = StringScanner.new(str)
312
339
  result = process_nested_structure(scanner, "{{", "}}") do |contents|
313
340
  parts = contents.split("|")
314
- # type_code = parts.first
315
- # case type_code
316
- # when $type_code_regex
317
- # out = parts[-1]
318
- # else
319
- # case parts.size
320
- # when 0
321
- # out = ""
322
- # when 1
323
- # out = parts.first || ""
324
- # else
325
- # while parts.size > 2 && parts.last.split("=").size > 1
326
- while parts.size > 1 && parts.last.split("=").size > 1
327
- parts.pop
328
- end
329
- out = parts.last || ""
330
- # end
331
- # end
341
+ if /\A(?:lang|fontsize)\z/i =~ parts[0]
342
+ parts.shift
343
+ elsif /\Alang=/i =~ parts[1]
344
+ parts.shift
345
+ end
346
+
347
+ if parts.size == 1
348
+ out = parts[0]
349
+ else
350
+ keyval = parts[1].split("=")
351
+ if keyval.size > 1
352
+ out = keyval[1]
353
+ else
354
+ out = parts[1] || ""
355
+ end
356
+ end
357
+
332
358
  out.strip
333
359
  end
334
360
  str.replace result
335
361
  end
336
362
 
337
-
338
363
  #################### file related utilities ####################
339
364
 
340
365
  # collect filenames recursively
@@ -427,5 +452,4 @@ module Wp2txt
427
452
  str = i.to_s.reverse
428
453
  return str.scan(/.?.?./).join(',').reverse
429
454
  end
430
-
431
455
  end
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.9.4"
2
+ VERSION = "0.9.5"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.4
4
+ version: 0.9.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-07-29 00:00:00.000000000 Z
11
+ date: 2022-08-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -85,8 +85,10 @@ files:
85
85
  - bin/wp2txt
86
86
  - data/output_samples/testdata_en.txt
87
87
  - data/output_samples/testdata_en_categories.txt
88
+ - data/output_samples/testdata_en_summary.txt
88
89
  - data/output_samples/testdata_ja.txt
89
90
  - data/output_samples/testdata_ja_categories.txt
91
+ - data/output_samples/testdata_ja_summary.txt
90
92
  - data/testdata_en.bz2
91
93
  - data/testdata_ja.bz2
92
94
  - lib/wp2txt.rb