wp2txt 0.9.2 → 0.9.5

Sign up to get free protection for your applications and to get access to all the features.
data/data/testdata_en.bz2 CHANGED
Binary file
data/data/testdata_ja.bz2 CHANGED
Binary file
@@ -37,10 +37,11 @@ module Wp2txt
37
37
  def initialize(text, title = "", strip_tmarker = false)
38
38
  @title = title.strip
39
39
  @strip_tmarker = strip_tmarker
40
- convert_characters!(text)
40
+ convert_characters!(text)
41
+ remove_html!(text)
42
+ remove_complex!(text)
41
43
  make_reference!(text)
42
44
  remove_ref!(text)
43
-
44
45
  parse text
45
46
  end
46
47
 
data/lib/wp2txt/utils.rb CHANGED
@@ -108,7 +108,7 @@ module Wp2txt
108
108
  process_interwiki_links!(text)
109
109
  process_external_links!(text)
110
110
  unescape_nowiki!(text)
111
- #####
111
+
112
112
  remove_directive!(text)
113
113
  remove_emphasis!(text)
114
114
  mndash!(text)
@@ -116,11 +116,15 @@ module Wp2txt
116
116
  remove_tag!(text)
117
117
  correct_inline_template!(text) unless $leave_inline_template
118
118
  remove_templates!(text) unless $leave_inline_template
119
- # remove_table!(text) unless $leave_table
119
+ remove_table!(text) unless $leave_table
120
120
  end
121
121
 
122
122
  def cleanup!(text)
123
123
  text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
124
+ text.gsub!(/^File:.+$/){""}
125
+ text.gsub!(/^\|.*$/){""}
126
+ text.gsub!(/^{{.*$/){""}
127
+ text.gsub!(/^}}.*$/){""}
124
128
  text.gsub!(/\n\n\n+/m){"\n\n"}
125
129
  text.strip!
126
130
  text << "\n\n"
@@ -128,8 +132,9 @@ module Wp2txt
128
132
  #################### parser for nested structure ####################
129
133
 
130
134
  def process_nested_structure(scanner, left, right, &block)
135
+ test = false
131
136
  buffer = ""
132
- begin
137
+ # begin
133
138
  if left == "[" && right == "]"
134
139
  regex = $single_square_bracket_regex
135
140
  elsif left == "[[" && right == "]]"
@@ -141,7 +146,7 @@ module Wp2txt
141
146
  elsif left == "{|" && right == "|}"
142
147
  regex = $curly_square_bracket_regex
143
148
  else
144
- regex = Regexp.new('(#{Regexp.escape(left)}|#{Regexp.escape(right)})', Regexp::MULTILINE)
149
+ regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
145
150
  end
146
151
  while str = scanner.scan_until(regex)
147
152
  case scanner[1]
@@ -167,9 +172,9 @@ module Wp2txt
167
172
  scanner.string = buffer
168
173
  return process_nested_structure(scanner, left, right, &block) || ""
169
174
  end
170
- rescue => e
171
- return scanner.string
172
- end
175
+ # rescue => e
176
+ # return scanner.string
177
+ # end
173
178
  end
174
179
 
175
180
  #################### methods used from format_wiki ####################
@@ -234,6 +239,10 @@ module Wp2txt
234
239
  result = process_nested_structure(scanner, "{", "}") do |contents|
235
240
  ""
236
241
  end
242
+ scanner = StringScanner.new(result)
243
+ result = process_nested_structure(scanner, "{{", "}}") do |contents|
244
+ ""
245
+ end
237
246
  str.replace(result)
238
247
  end
239
248
 
@@ -299,6 +308,24 @@ module Wp2txt
299
308
  def remove_ref!(str)
300
309
  str.gsub!($format_ref_regex){""}
301
310
  end
311
+
312
+ def remove_html!(str)
313
+ ["div", "gallery", "timeline"].each do |tag|
314
+ scanner = StringScanner.new(str)
315
+ result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
316
+ ""
317
+ end
318
+ str.replace(result)
319
+ end
320
+ end
321
+
322
+ def remove_complex!(str)
323
+ str.gsub!(/(?:'')?\[https?\:[^\[\]]+?\s([^\]]++)?\](?:'')?/){$1}
324
+ str.gsub!(/(?:'')?\[https?\:[^\[\]]++\](?:'')?\s?/){""}
325
+ str.gsub!(/\<\<([^<>]++)\>\>\s?/){"《#{$1}》"}
326
+ str.gsub!(/\{\{(?:Infobox|efn|Sfn|div col|no col|bar box|formatnum\:|Refnest\||Refnest\||Col\||See also\||R\|)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}/im){""}
327
+ str.gsub!(/\[\[(?:File|ファイル)\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]/im){""}
328
+ end
302
329
 
303
330
  def make_reference!(str)
304
331
  str.gsub!($make_reference_regex_a){"\n"}
@@ -311,30 +338,28 @@ module Wp2txt
311
338
  scanner = StringScanner.new(str)
312
339
  result = process_nested_structure(scanner, "{{", "}}") do |contents|
313
340
  parts = contents.split("|")
314
- # type_code = parts.first
315
- # case type_code
316
- # when $type_code_regex
317
- # out = parts[-1]
318
- # else
319
- # case parts.size
320
- # when 0
321
- # out = ""
322
- # when 1
323
- # out = parts.first || ""
324
- # else
325
- # while parts.size > 2 && parts.last.split("=").size > 1
326
- while parts.size > 1 && parts.last.split("=").size > 1
327
- parts.pop
328
- end
329
- out = parts.last || ""
330
- # end
331
- # end
341
+ if /\A(?:lang|fontsize)\z/i =~ parts[0]
342
+ parts.shift
343
+ elsif /\Alang=/i =~ parts[1]
344
+ parts.shift
345
+ end
346
+
347
+ if parts.size == 1
348
+ out = parts[0]
349
+ else
350
+ keyval = parts[1].split("=")
351
+ if keyval.size > 1
352
+ out = keyval[1]
353
+ else
354
+ out = parts[1] || ""
355
+ end
356
+ end
357
+
332
358
  out.strip
333
359
  end
334
360
  str.replace result
335
361
  end
336
362
 
337
-
338
363
  #################### file related utilities ####################
339
364
 
340
365
  # collect filenames recursively
@@ -427,5 +452,4 @@ module Wp2txt
427
452
  str = i.to_s.reverse
428
453
  return str.scan(/.?.?./).join(',').reverse
429
454
  end
430
-
431
455
  end
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.9.2"
2
+ VERSION = "0.9.5"
3
3
  end
data/lib/wp2txt.rb CHANGED
@@ -101,7 +101,7 @@ module Wp2txt
101
101
  if /.bz2$/ =~ @input_file
102
102
  unless NO_BZ2
103
103
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
104
- @parent.msg("WP2TXT is spawming #{@num_threads} threads to process data \n", 0)
104
+ @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
105
105
  @parent.msg("Preparing ... This may take several minutes or more ", 0)
106
106
  @infile_size = file_size(file)
107
107
  @parent.msg("... Done.", 1)
@@ -113,7 +113,7 @@ module Wp2txt
113
113
  else
114
114
  file = IO.popen("bzip2 -c -d #{@input_file}")
115
115
  end
116
- @parent.msg("WP2TXT is spawming #{@num_threads} threads to process data \n", 0)
116
+ @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
117
117
  @parent.msg("Preparing ... This may take several minutes or more ", 0)
118
118
  @infile_size = file_size(file)
119
119
  @parent.msg("... Done.", 1)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.2
4
+ version: 0.9.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-07-25 00:00:00.000000000 Z
11
+ date: 2022-08-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -84,7 +84,11 @@ files:
84
84
  - bin/benchmark.rb
85
85
  - bin/wp2txt
86
86
  - data/output_samples/testdata_en.txt
87
+ - data/output_samples/testdata_en_categories.txt
88
+ - data/output_samples/testdata_en_summary.txt
87
89
  - data/output_samples/testdata_ja.txt
90
+ - data/output_samples/testdata_ja_categories.txt
91
+ - data/output_samples/testdata_ja_summary.txt
88
92
  - data/testdata_en.bz2
89
93
  - data/testdata_ja.bz2
90
94
  - lib/wp2txt.rb
@@ -114,7 +118,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
114
118
  - !ruby/object:Gem::Version
115
119
  version: '0'
116
120
  requirements: []
117
- rubygems_version: 3.3.3
121
+ rubygems_version: 3.3.7
118
122
  signing_key:
119
123
  specification_version: 4
120
124
  summary: Wikipedia dump to text converter