wp2txt 0.9.5 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wp2txt/utils.rb CHANGED
@@ -77,6 +77,22 @@ $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.esca
77
77
  $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
78
78
  $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
79
79
  $curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
80
+
81
+ $complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
82
+ $complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
83
+ $complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
84
+ $complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
85
+ $complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
86
+
87
+ $cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
88
+ $cleanup_regex_02 = Regexp.new('^File:.+$')
89
+ $cleanup_regex_03 = Regexp.new('^\|.*$')
90
+ $cleanup_regex_04 = Regexp.new('\{\{.*$')
91
+ $cleanup_regex_05 = Regexp.new('^.*\}\}')
92
+ $cleanup_regex_06 = Regexp.new('\{\|.*$')
93
+ $cleanup_regex_07 = Regexp.new('^.*\|\}')
94
+ $cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
95
+
80
96
  ###################################################
81
97
 
82
98
  module Wp2txt
@@ -104,11 +120,12 @@ module Wp2txt
104
120
  end
105
121
 
106
122
  def format_wiki!(text, has_retried = false)
123
+ remove_complex!(text)
124
+
107
125
  escape_nowiki!(text)
108
126
  process_interwiki_links!(text)
109
127
  process_external_links!(text)
110
128
  unescape_nowiki!(text)
111
-
112
129
  remove_directive!(text)
113
130
  remove_emphasis!(text)
114
131
  mndash!(text)
@@ -120,61 +137,64 @@ module Wp2txt
120
137
  end
121
138
 
122
139
  def cleanup!(text)
123
- text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
124
- text.gsub!(/^File:.+$/){""}
125
- text.gsub!(/^\|.*$/){""}
126
- text.gsub!(/^{{.*$/){""}
127
- text.gsub!(/^}}.*$/){""}
128
- text.gsub!(/\n\n\n+/m){"\n\n"}
140
+ text.gsub!($cleanup_regex_01){""}
141
+ text.gsub!($cleanup_regex_02){""}
142
+ text.gsub!($cleanup_regex_03){""}
143
+ text.gsub!($cleanup_regex_04){""}
144
+ text.gsub!($cleanup_regex_05){""}
145
+ text.gsub!($cleanup_regex_06){""}
146
+ text.gsub!($cleanup_regex_07){""}
147
+ text.gsub!($cleanup_regex_08){"\n\n"}
129
148
  text.strip!
130
149
  text << "\n\n"
131
150
  end
151
+
132
152
  #################### parser for nested structure ####################
133
153
 
134
154
  def process_nested_structure(scanner, left, right, &block)
135
155
  test = false
136
156
  buffer = ""
137
- # begin
138
- if left == "[" && right == "]"
139
- regex = $single_square_bracket_regex
140
- elsif left == "[[" && right == "]]"
141
- regex = $double_square_bracket_regex
142
- elsif left == "{" && right == "}"
143
- regex = $single_curly_bracket_regex
144
- elsif left == "{{" && right == "}}"
145
- regex = $double_curly_bracket_regex
146
- elsif left == "{|" && right == "|}"
147
- regex = $curly_square_bracket_regex
148
- else
149
- regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
150
- end
151
- while str = scanner.scan_until(regex)
152
- case scanner[1]
153
- when left
154
- buffer << str
155
- has_left = true
156
- when right
157
- if has_left
158
- buffer = buffer[0...-(left.size)]
159
- contents = block.call(str[0...-(left.size)])
160
- buffer << contents
161
- break
162
- else
157
+ begin
158
+ if left == "[" && right == "]"
159
+ regex = $single_square_bracket_regex
160
+ elsif left == "[[" && right == "]]"
161
+ regex = $double_square_bracket_regex
162
+ elsif left == "{" && right == "}"
163
+ regex = $single_curly_bracket_regex
164
+ elsif left == "{{" && right == "}}"
165
+ regex = $double_curly_bracket_regex
166
+ elsif left == "{|" && right == "|}"
167
+ regex = $curly_square_bracket_regex
168
+ else
169
+ regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
170
+ end
171
+ while str = scanner.scan_until(regex)
172
+ case scanner[1]
173
+ when left
163
174
  buffer << str
175
+ has_left = true
176
+ when right
177
+ if has_left
178
+ buffer = buffer[0...-(left.size)]
179
+ contents = block.call(str[0...-(left.size)])
180
+ buffer << contents
181
+ break
182
+ else
183
+ buffer << str
184
+ end
164
185
  end
165
186
  end
166
- end
167
- buffer << scanner.rest
187
+ buffer << scanner.rest
168
188
 
169
- if buffer == scanner.string
170
- return buffer
171
- else
172
- scanner.string = buffer
173
- return process_nested_structure(scanner, left, right, &block) || ""
189
+ if buffer == scanner.string
190
+ return buffer
191
+ else
192
+ scanner.string = buffer
193
+ return process_nested_structure(scanner, left, right, &block) || ""
194
+ end
195
+ rescue => e
196
+ return scanner.string
174
197
  end
175
- # rescue => e
176
- # return scanner.string
177
- # end
178
198
  end
179
199
 
180
200
  #################### methods used from format_wiki ####################
@@ -217,12 +237,16 @@ module Wp2txt
217
237
  def process_external_links!(str)
218
238
  scanner = StringScanner.new(str)
219
239
  result = process_nested_structure(scanner, "[", "]") do |contents|
220
- parts = contents.split(" ", 2)
221
- case parts.size
222
- when 1
223
- parts.first || ""
240
+ if /\A\s.+\s\z/ =~ contents
241
+ " (#{contents.strip}) "
224
242
  else
225
- parts.last || ""
243
+ parts = contents.split(" ", 2)
244
+ case parts.size
245
+ when 1
246
+ parts.first || ""
247
+ else
248
+ parts.last || ""
249
+ end
226
250
  end
227
251
  end
228
252
  str.replace(result)
@@ -239,10 +263,6 @@ module Wp2txt
239
263
  result = process_nested_structure(scanner, "{", "}") do |contents|
240
264
  ""
241
265
  end
242
- scanner = StringScanner.new(result)
243
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
244
- ""
245
- end
246
266
  str.replace(result)
247
267
  end
248
268
 
@@ -310,7 +330,8 @@ module Wp2txt
310
330
  end
311
331
 
312
332
  def remove_html!(str)
313
- ["div", "gallery", "timeline"].each do |tag|
333
+ str.gsub!(/<[^<>]+\/>/){""}
334
+ ["div", "gallery", "timeline", "noinclude"].each do |tag|
314
335
  scanner = StringScanner.new(str)
315
336
  result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
316
337
  ""
@@ -320,11 +341,11 @@ module Wp2txt
320
341
  end
321
342
 
322
343
  def remove_complex!(str)
323
- str.gsub!(/(?:'')?\[https?\:[^\[\]]+?\s([^\]]++)?\](?:'')?/){$1}
324
- str.gsub!(/(?:'')?\[https?\:[^\[\]]++\](?:'')?\s?/){""}
325
- str.gsub!(/\<\<([^<>]++)\>\>\s?/){"《#{$1}》"}
326
- str.gsub!(/\{\{(?:Infobox|efn|Sfn|div col|no col|bar box|formatnum\:|Refnest\||Refnest\||Col\||See also\||R\|)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}/im){""}
327
- str.gsub!(/\[\[(?:File|ファイル)\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]/im){""}
344
+ str.gsub!($complex_regex_01){"《#{$1}》"}
345
+ str.gsub!($complex_regex_02){""}
346
+ str.gsub!($complex_regex_03){""}
347
+ str.gsub!($complex_regex_04){""}
348
+ str.gsub!($complex_regex_05){""}
328
349
  end
329
350
 
330
351
  def make_reference!(str)
@@ -340,6 +361,8 @@ module Wp2txt
340
361
  parts = contents.split("|")
341
362
  if /\A(?:lang|fontsize)\z/i =~ parts[0]
342
363
  parts.shift
364
+ elsif /\Alang\-/i =~ parts[0]
365
+ parts.shift
343
366
  elsif /\Alang=/i =~ parts[1]
344
367
  parts.shift
345
368
  end
@@ -347,10 +370,14 @@ module Wp2txt
347
370
  if parts.size == 1
348
371
  out = parts[0]
349
372
  else
350
- keyval = parts[1].split("=")
351
- if keyval.size > 1
352
- out = keyval[1]
353
- else
373
+ begin
374
+ keyval = parts[1].split("=")
375
+ if keyval.size > 1
376
+ out = keyval[1]
377
+ else
378
+ out = parts[1] || ""
379
+ end
380
+ rescue
354
381
  out = parts[1] || ""
355
382
  end
356
383
  end
@@ -418,7 +445,7 @@ module Wp2txt
418
445
  end
419
446
  end
420
447
 
421
- def rename(files)
448
+ def rename(files, ext = "txt")
422
449
  # num of digits necessary to name the last file generated
423
450
  maxwidth = 0
424
451
 
@@ -431,8 +458,9 @@ module Wp2txt
431
458
  newname= f.sub(/\-(\d+)\z/) do
432
459
  "-" + sprintf("%0#{maxwidth}d", $1.to_i)
433
460
  end
434
- File.rename(f, newname + ".txt")
461
+ File.rename(f, newname + ".#{ext}")
435
462
  end
463
+ return true
436
464
  end
437
465
 
438
466
  # convert int of seconds to string in the format 00:00:00
@@ -448,8 +476,4 @@ module Wp2txt
448
476
  return str
449
477
  end
450
478
 
451
- def decimal_format(i)
452
- str = i.to_s.reverse
453
- return str.scan(/.?.?./).join(',').reverse
454
- end
455
479
  end
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.9.5"
2
+ VERSION = "1.0.1"
3
3
  end