wp2txt 0.9.5 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wp2txt/utils.rb CHANGED
@@ -77,6 +77,22 @@ $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.esca
77
77
  $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
78
78
  $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
79
79
  $curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
80
+
81
+ $complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
82
+ $complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
83
+ $complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
84
+ $complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
85
+ $complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
86
+
87
+ $cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
88
+ $cleanup_regex_02 = Regexp.new('^File:.+$')
89
+ $cleanup_regex_03 = Regexp.new('^\|.*$')
90
+ $cleanup_regex_04 = Regexp.new('\{\{.*$')
91
+ $cleanup_regex_05 = Regexp.new('^.*\}\}')
92
+ $cleanup_regex_06 = Regexp.new('\{\|.*$')
93
+ $cleanup_regex_07 = Regexp.new('^.*\|\}')
94
+ $cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
95
+
80
96
  ###################################################
81
97
 
82
98
  module Wp2txt
@@ -104,11 +120,12 @@ module Wp2txt
104
120
  end
105
121
 
106
122
  def format_wiki!(text, has_retried = false)
123
+ remove_complex!(text)
124
+
107
125
  escape_nowiki!(text)
108
126
  process_interwiki_links!(text)
109
127
  process_external_links!(text)
110
128
  unescape_nowiki!(text)
111
-
112
129
  remove_directive!(text)
113
130
  remove_emphasis!(text)
114
131
  mndash!(text)
@@ -120,61 +137,64 @@ module Wp2txt
120
137
  end
121
138
 
122
139
  def cleanup!(text)
123
- text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
124
- text.gsub!(/^File:.+$/){""}
125
- text.gsub!(/^\|.*$/){""}
126
- text.gsub!(/^{{.*$/){""}
127
- text.gsub!(/^}}.*$/){""}
128
- text.gsub!(/\n\n\n+/m){"\n\n"}
140
+ text.gsub!($cleanup_regex_01){""}
141
+ text.gsub!($cleanup_regex_02){""}
142
+ text.gsub!($cleanup_regex_03){""}
143
+ text.gsub!($cleanup_regex_04){""}
144
+ text.gsub!($cleanup_regex_05){""}
145
+ text.gsub!($cleanup_regex_06){""}
146
+ text.gsub!($cleanup_regex_07){""}
147
+ text.gsub!($cleanup_regex_08){"\n\n"}
129
148
  text.strip!
130
149
  text << "\n\n"
131
150
  end
151
+
132
152
  #################### parser for nested structure ####################
133
153
 
134
154
  def process_nested_structure(scanner, left, right, &block)
135
155
  test = false
136
156
  buffer = ""
137
- # begin
138
- if left == "[" && right == "]"
139
- regex = $single_square_bracket_regex
140
- elsif left == "[[" && right == "]]"
141
- regex = $double_square_bracket_regex
142
- elsif left == "{" && right == "}"
143
- regex = $single_curly_bracket_regex
144
- elsif left == "{{" && right == "}}"
145
- regex = $double_curly_bracket_regex
146
- elsif left == "{|" && right == "|}"
147
- regex = $curly_square_bracket_regex
148
- else
149
- regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
150
- end
151
- while str = scanner.scan_until(regex)
152
- case scanner[1]
153
- when left
154
- buffer << str
155
- has_left = true
156
- when right
157
- if has_left
158
- buffer = buffer[0...-(left.size)]
159
- contents = block.call(str[0...-(left.size)])
160
- buffer << contents
161
- break
162
- else
157
+ begin
158
+ if left == "[" && right == "]"
159
+ regex = $single_square_bracket_regex
160
+ elsif left == "[[" && right == "]]"
161
+ regex = $double_square_bracket_regex
162
+ elsif left == "{" && right == "}"
163
+ regex = $single_curly_bracket_regex
164
+ elsif left == "{{" && right == "}}"
165
+ regex = $double_curly_bracket_regex
166
+ elsif left == "{|" && right == "|}"
167
+ regex = $curly_square_bracket_regex
168
+ else
169
+ regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
170
+ end
171
+ while str = scanner.scan_until(regex)
172
+ case scanner[1]
173
+ when left
163
174
  buffer << str
175
+ has_left = true
176
+ when right
177
+ if has_left
178
+ buffer = buffer[0...-(left.size)]
179
+ contents = block.call(str[0...-(left.size)])
180
+ buffer << contents
181
+ break
182
+ else
183
+ buffer << str
184
+ end
164
185
  end
165
186
  end
166
- end
167
- buffer << scanner.rest
187
+ buffer << scanner.rest
168
188
 
169
- if buffer == scanner.string
170
- return buffer
171
- else
172
- scanner.string = buffer
173
- return process_nested_structure(scanner, left, right, &block) || ""
189
+ if buffer == scanner.string
190
+ return buffer
191
+ else
192
+ scanner.string = buffer
193
+ return process_nested_structure(scanner, left, right, &block) || ""
194
+ end
195
+ rescue => e
196
+ return scanner.string
174
197
  end
175
- # rescue => e
176
- # return scanner.string
177
- # end
178
198
  end
179
199
 
180
200
  #################### methods used from format_wiki ####################
@@ -217,12 +237,16 @@ module Wp2txt
217
237
  def process_external_links!(str)
218
238
  scanner = StringScanner.new(str)
219
239
  result = process_nested_structure(scanner, "[", "]") do |contents|
220
- parts = contents.split(" ", 2)
221
- case parts.size
222
- when 1
223
- parts.first || ""
240
+ if /\A\s.+\s\z/ =~ contents
241
+ " (#{contents.strip}) "
224
242
  else
225
- parts.last || ""
243
+ parts = contents.split(" ", 2)
244
+ case parts.size
245
+ when 1
246
+ parts.first || ""
247
+ else
248
+ parts.last || ""
249
+ end
226
250
  end
227
251
  end
228
252
  str.replace(result)
@@ -239,10 +263,6 @@ module Wp2txt
239
263
  result = process_nested_structure(scanner, "{", "}") do |contents|
240
264
  ""
241
265
  end
242
- scanner = StringScanner.new(result)
243
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
244
- ""
245
- end
246
266
  str.replace(result)
247
267
  end
248
268
 
@@ -310,7 +330,8 @@ module Wp2txt
310
330
  end
311
331
 
312
332
  def remove_html!(str)
313
- ["div", "gallery", "timeline"].each do |tag|
333
+ str.gsub!(/<[^<>]+\/>/){""}
334
+ ["div", "gallery", "timeline", "noinclude"].each do |tag|
314
335
  scanner = StringScanner.new(str)
315
336
  result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
316
337
  ""
@@ -320,11 +341,11 @@ module Wp2txt
320
341
  end
321
342
 
322
343
  def remove_complex!(str)
323
- str.gsub!(/(?:'')?\[https?\:[^\[\]]+?\s([^\]]++)?\](?:'')?/){$1}
324
- str.gsub!(/(?:'')?\[https?\:[^\[\]]++\](?:'')?\s?/){""}
325
- str.gsub!(/\<\<([^<>]++)\>\>\s?/){"《#{$1}》"}
326
- str.gsub!(/\{\{(?:Infobox|efn|Sfn|div col|no col|bar box|formatnum\:|Refnest\||Refnest\||Col\||See also\||R\|)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}/im){""}
327
- str.gsub!(/\[\[(?:File|ファイル)\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]/im){""}
344
+ str.gsub!($complex_regex_01){"《#{$1}》"}
345
+ str.gsub!($complex_regex_02){""}
346
+ str.gsub!($complex_regex_03){""}
347
+ str.gsub!($complex_regex_04){""}
348
+ str.gsub!($complex_regex_05){""}
328
349
  end
329
350
 
330
351
  def make_reference!(str)
@@ -340,6 +361,8 @@ module Wp2txt
340
361
  parts = contents.split("|")
341
362
  if /\A(?:lang|fontsize)\z/i =~ parts[0]
342
363
  parts.shift
364
+ elsif /\Alang\-/i =~ parts[0]
365
+ parts.shift
343
366
  elsif /\Alang=/i =~ parts[1]
344
367
  parts.shift
345
368
  end
@@ -347,10 +370,14 @@ module Wp2txt
347
370
  if parts.size == 1
348
371
  out = parts[0]
349
372
  else
350
- keyval = parts[1].split("=")
351
- if keyval.size > 1
352
- out = keyval[1]
353
- else
373
+ begin
374
+ keyval = parts[1].split("=")
375
+ if keyval.size > 1
376
+ out = keyval[1]
377
+ else
378
+ out = parts[1] || ""
379
+ end
380
+ rescue
354
381
  out = parts[1] || ""
355
382
  end
356
383
  end
@@ -418,7 +445,7 @@ module Wp2txt
418
445
  end
419
446
  end
420
447
 
421
- def rename(files)
448
+ def rename(files, ext = "txt")
422
449
  # num of digits necessary to name the last file generated
423
450
  maxwidth = 0
424
451
 
@@ -431,8 +458,9 @@ module Wp2txt
431
458
  newname= f.sub(/\-(\d+)\z/) do
432
459
  "-" + sprintf("%0#{maxwidth}d", $1.to_i)
433
460
  end
434
- File.rename(f, newname + ".txt")
461
+ File.rename(f, newname + ".#{ext}")
435
462
  end
463
+ return true
436
464
  end
437
465
 
438
466
  # convert int of seconds to string in the format 00:00:00
@@ -448,8 +476,4 @@ module Wp2txt
448
476
  return str
449
477
  end
450
478
 
451
- def decimal_format(i)
452
- str = i.to_s.reverse
453
- return str.scan(/.?.?./).join(',').reverse
454
- end
455
479
  end
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.9.5"
2
+ VERSION = "1.0.1"
3
3
  end