wp2txt 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wp2txt/utils.rb CHANGED
@@ -1,182 +1,87 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- require 'strscan'
5
- require 'find'
6
- require 'htmlentities'
7
-
8
- ###################################################
9
- # global variables to save resource for generating regexps
10
- # those with a trailing number 1 represent opening tag/markup
11
- # those with a trailing number 2 represent closing tag/markup
12
- # those without a trailing number contain both opening/closing tags/markups
13
-
14
- $html_decoder = HTMLEntities.new
15
-
16
- $entities = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
17
- $html_hash = Hash[*$entities.flatten]
18
- $html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
19
- $ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
20
- $ml_template_end_regex = Regexp.new('\}\}\s*$')
21
- $ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
22
- $ml_linkend_regex = Regexp.new('\]\]\s*$')
23
- $isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
24
- $isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
25
- $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
26
- $in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
27
- $in_inputbox_regex1 = Regexp.new('<inputbox>')
28
- $in_inputbox_regex2 = Regexp.new('<\/inputbox>')
29
- $in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
30
- $in_source_regex1 = Regexp.new('<source.*?>')
31
- $in_source_regex2 = Regexp.new('<\/source>')
32
- $in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
33
- $in_math_regex1 = Regexp.new('<math.*?>')
34
- $in_math_regex2 = Regexp.new('<\/math>')
35
- $in_heading_regex = Regexp.new('^=+.*?=+$')
36
- $in_html_table_regex = Regexp.new('<table.*?><\/table>')
37
- $in_html_table_regex1 = Regexp.new('<table\b')
38
- $in_html_table_regex2 = Regexp.new('<\/\s*table>')
39
- $in_table_regex1 = Regexp.new('^\s*\{\|')
40
- $in_table_regex2 = Regexp.new('^\|\}.*?$')
41
- $in_unordered_regex = Regexp.new('^\*')
42
- $in_ordered_regex = Regexp.new('^\#')
43
- $in_pre_regex = Regexp.new('^ ')
44
- $in_definition_regex = Regexp.new('^[\;\:]')
45
- $blank_line_regex = Regexp.new('^\s*$')
46
- $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
47
- $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
48
- $remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
49
- $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
50
- $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
51
- $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
52
- $remove_hr_regex = Regexp.new('^\s*\-+\s*$')
53
- $make_reference_regex_a = Regexp.new('<br ?\/>')
54
- $make_reference_regex_b = Regexp.new('<ref[^>]*\/>')
55
- $make_reference_regex_c = Regexp.new('<ref[^>]*>')
56
- $make_reference_regex_d = Regexp.new('<\/ref>')
57
- $format_ref_regex = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
58
- $heading_onset_regex = Regexp.new('^(\=+)\s+')
59
- $heading_coda_regex = Regexp.new('\s+(\=+)$')
60
- $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
61
- $pre_marks_regex = Regexp.new('\A\^\ ')
62
- $def_marks_regex = Regexp.new('\A[\;\:\ ]+')
63
- $onset_bar_regex = Regexp.new('\A[^\|]+\z')
64
-
65
- $category_patterns = ["Category", "Categoria"].join("|")
66
- $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
67
-
68
- $escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
69
- $unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
70
-
71
- $remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
72
- $remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
73
- $type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
74
-
75
- $single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escape(']')})", Regexp::MULTILINE)
76
- $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
77
- $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
78
- $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
79
- $curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
80
-
81
- $complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
82
- $complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
83
- $complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
84
- $complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
85
- $complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
86
-
87
- $cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
88
- $cleanup_regex_02 = Regexp.new('^File:.+$')
89
- $cleanup_regex_03 = Regexp.new('^\|.*$')
90
- $cleanup_regex_04 = Regexp.new('\{\{.*$')
91
- $cleanup_regex_05 = Regexp.new('^.*\}\}')
92
- $cleanup_regex_06 = Regexp.new('\{\|.*$')
93
- $cleanup_regex_07 = Regexp.new('^.*\|\}')
94
- $cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
95
-
96
- ###################################################
1
+ # frozen_string_literal: true
97
2
 
98
- module Wp2txt
3
+ require "strscan"
4
+ require "find"
5
+ require_relative "regex"
99
6
 
100
- def convert_characters!(text, has_retried = false)
101
- begin
102
- text << ""
103
- chrref_to_utf!(text)
104
- special_chr!(text)
105
-
106
- rescue # detect invalid byte sequence in UTF-8
107
- if has_retried
108
- puts "invalid byte sequence detected"
109
- puts "******************************"
110
- File.open("error_log.txt", "w") do |f|
111
- f.write text
112
- end
113
- exit
114
- else
115
- text.encode!("UTF-16")
116
- text.encode!("UTF-8")
117
- convert_characters!(text, true)
7
+ module Wp2txt
8
+ def convert_characters(text, has_retried = false)
9
+ text << ""
10
+ text = chrref_to_utf(text)
11
+ text = special_chr(text)
12
+ text = text.encode("UTF-8", "UTF-8", invalid: :replace, replace: "")
13
+ rescue StandardError # detect invalid byte sequence in UTF-8
14
+ if has_retried
15
+ puts "invalid byte sequence detected"
16
+ puts "******************************"
17
+ File.open("error_log.txt", "w") do |f|
18
+ f.write text
118
19
  end
20
+ exit
21
+ else
22
+ text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
23
+ text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
24
+ convert_characters(text, true)
119
25
  end
120
26
  end
121
-
122
- def format_wiki!(text, has_retried = false)
123
- remove_complex!(text)
124
-
125
- escape_nowiki!(text)
126
- process_interwiki_links!(text)
127
- process_external_links!(text)
128
- unescape_nowiki!(text)
129
- remove_directive!(text)
130
- remove_emphasis!(text)
131
- mndash!(text)
132
- remove_hr!(text)
133
- remove_tag!(text)
134
- correct_inline_template!(text) unless $leave_inline_template
135
- remove_templates!(text) unless $leave_inline_template
136
- remove_table!(text) unless $leave_table
27
+
28
+ def format_wiki(text, config = {})
29
+ text = remove_complex(text)
30
+ text = escape_nowiki(text)
31
+ text = process_interwiki_links(text)
32
+ text = process_external_links(text)
33
+ text = unescape_nowiki(text)
34
+ text = remove_directive(text)
35
+ text = remove_emphasis(text)
36
+ text = mndash(text)
37
+ text = remove_hr(text)
38
+ text = remove_tag(text)
39
+ text = correct_inline_template(text) unless config[:inline]
40
+ text = remove_templates(text) unless config[:inline]
41
+ text = remove_table(text) unless config[:table]
42
+ text
137
43
  end
138
-
139
- def cleanup!(text)
140
- text.gsub!($cleanup_regex_01){""}
141
- text.gsub!($cleanup_regex_02){""}
142
- text.gsub!($cleanup_regex_03){""}
143
- text.gsub!($cleanup_regex_04){""}
144
- text.gsub!($cleanup_regex_05){""}
145
- text.gsub!($cleanup_regex_06){""}
146
- text.gsub!($cleanup_regex_07){""}
147
- text.gsub!($cleanup_regex_08){"\n\n"}
148
- text.strip!
44
+
45
+ def cleanup(text)
46
+ text = text.gsub(CLEANUP_REGEX_01) { "" }
47
+ text = text.gsub(CLEANUP_REGEX_02) { "" }
48
+ text = text.gsub(CLEANUP_REGEX_03) { "" }
49
+ text = text.gsub(CLEANUP_REGEX_04) { "" }
50
+ text = text.gsub(CLEANUP_REGEX_05) { "" }
51
+ text = text.gsub(CLEANUP_REGEX_06) { "" }
52
+ text = text.gsub(CLEANUP_REGEX_07) { "" }
53
+ text = text.gsub(CLEANUP_REGEX_08) { "\n\n" }
54
+ text = text.strip
149
55
  text << "\n\n"
150
56
  end
151
57
 
152
58
  #################### parser for nested structure ####################
153
-
59
+
154
60
  def process_nested_structure(scanner, left, right, &block)
155
- test = false
156
- buffer = ""
61
+ buffer = +""
157
62
  begin
158
- if left == "[" && right == "]"
159
- regex = $single_square_bracket_regex
160
- elsif left == "[[" && right == "]]"
161
- regex = $double_square_bracket_regex
162
- elsif left == "{" && right == "}"
163
- regex = $single_curly_bracket_regex
164
- elsif left == "{{" && right == "}}"
165
- regex = $double_curly_bracket_regex
166
- elsif left == "{|" && right == "|}"
167
- regex = $curly_square_bracket_regex
168
- else
169
- regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
170
- end
171
- while str = scanner.scan_until(regex)
63
+ regex = if left == "[" && right == "]"
64
+ SINGLE_SQUARE_BRACKET_REGEX
65
+ elsif left == "[[" && right == "]]"
66
+ DOUBLE_SQUARE_BRACKET_REGEX
67
+ elsif left == "{" && right == "}"
68
+ SINGLE_CURLY_BRACKET_REGEX
69
+ elsif left == "{{" && right == "}}"
70
+ DOUBLE_CURLY_BRACKET_REGEX
71
+ elsif left == "{|" && right == "|}"
72
+ CURLY_SQUARE_BRACKET_REGEX
73
+ else
74
+ Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
75
+ end
76
+ while (str = scanner.scan_until(regex))
172
77
  case scanner[1]
173
78
  when left
174
79
  buffer << str
175
80
  has_left = true
176
81
  when right
177
82
  if has_left
178
- buffer = buffer[0...-(left.size)]
179
- contents = block.call(str[0...-(left.size)])
83
+ buffer = buffer[0...-left.size]
84
+ contents = block.call(str[0...-left.size])
180
85
  buffer << contents
181
86
  break
182
87
  else
@@ -186,25 +91,23 @@ module Wp2txt
186
91
  end
187
92
  buffer << scanner.rest
188
93
 
189
- if buffer == scanner.string
190
- return buffer
191
- else
192
- scanner.string = buffer
193
- return process_nested_structure(scanner, left, right, &block) || ""
194
- end
195
- rescue => e
196
- return scanner.string
94
+ return buffer if buffer == scanner.string
95
+
96
+ scanner.string = buffer
97
+ process_nested_structure(scanner, left, right, &block) || ""
98
+ rescue StandardError
99
+ scanner.string
197
100
  end
198
- end
101
+ end
199
102
 
200
103
  #################### methods used from format_wiki ####################
201
- def escape_nowiki!(str)
104
+ def escape_nowiki(str)
202
105
  if @nowikis
203
106
  @nowikis.clear
204
107
  else
205
108
  @nowikis = {}
206
109
  end
207
- str.gsub!($escape_nowiki_regex) do
110
+ str.gsub(ESCAPE_NOWIKI_REGEX) do
208
111
  nowiki = $1
209
112
  nowiki_id = nowiki.object_id
210
113
  @nowikis[nowiki_id] = nowiki
@@ -212,17 +115,17 @@ module Wp2txt
212
115
  end
213
116
  end
214
117
 
215
- def unescape_nowiki!(str)
216
- str.gsub!($unescape_nowiki_regex) do
118
+ def unescape_nowiki(str)
119
+ str.gsub(UNESCAPE_NOWIKI_REGEX) do
217
120
  obj_id = $1.to_i
218
121
  @nowikis[obj_id]
219
122
  end
220
123
  end
221
-
222
- def process_interwiki_links!(str)
124
+
125
+ def process_interwiki_links(str)
223
126
  scanner = StringScanner.new(str)
224
- result = process_nested_structure(scanner, "[[", "]]") do |contents|
225
- parts = contents.split("|")
127
+ process_nested_structure(scanner, "[[", "]]") do |contents|
128
+ parts = contents.split("|")
226
129
  case parts.size
227
130
  when 1
228
131
  parts.first || ""
@@ -231,12 +134,11 @@ module Wp2txt
231
134
  parts.join("|")
232
135
  end
233
136
  end
234
- str.replace(result)
235
137
  end
236
138
 
237
- def process_external_links!(str)
139
+ def process_external_links(str)
238
140
  scanner = StringScanner.new(str)
239
- result = process_nested_structure(scanner, "[", "]") do |contents|
141
+ process_nested_structure(scanner, "[", "]") do |contents|
240
142
  if /\A\s.+\s\z/ =~ contents
241
143
  " (#{contents.strip}) "
242
144
  else
@@ -249,119 +151,115 @@ module Wp2txt
249
151
  end
250
152
  end
251
153
  end
252
- str.replace(result)
253
154
  end
254
155
 
255
156
  #################### methods used from format_article ####################
256
157
 
257
- def remove_templates!(str)
258
- scanner = StringScanner.new(str)
259
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
158
+ def remove_templates(str)
159
+ scanner1 = StringScanner.new(str)
160
+ result = process_nested_structure(scanner1, "{{", "}}") do
260
161
  ""
261
162
  end
262
- scanner = StringScanner.new(result)
263
- result = process_nested_structure(scanner, "{", "}") do |contents|
163
+ scanner2 = StringScanner.new(result)
164
+ process_nested_structure(scanner2, "{", "}") do
264
165
  ""
265
166
  end
266
- str.replace(result)
267
167
  end
268
-
269
- def remove_table!(str)
168
+
169
+ def remove_table(str)
270
170
  scanner = StringScanner.new(str)
271
- result = process_nested_structure(scanner, "{|", "|}") do |contents|
171
+ process_nested_structure(scanner, "{|", "|}") do
272
172
  ""
273
173
  end
274
- str.replace(result)
275
174
  end
276
-
277
- def special_chr!(str)
278
- str.replace $html_decoder.decode(str)
175
+
176
+ def special_chr(str)
177
+ HTML_DECODER.decode(str)
279
178
  end
280
179
 
281
- def remove_inbetween!(str, tagset = ['<', '>'])
180
+ def remove_inbetween(str, tagset = ["<", ">"])
282
181
  tagsets = Regexp.quote(tagset.uniq.join(""))
283
182
  regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
284
- str.gsub!(regex, "")
183
+ str.gsub(regex, "")
285
184
  end
286
185
 
287
- def remove_tag!(str)
288
- str.gsub!($remove_tag_regex, "")
186
+ def remove_tag(str)
187
+ str.gsub(REMOVE_TAG_REGEX, "")
289
188
  end
290
189
 
291
- def remove_directive!(str)
292
- str.gsub!($remove_directives_regex, "")
190
+ def remove_directive(str)
191
+ str.gsub(REMOVE_DIRECTIVES_REGEX, "")
293
192
  end
294
193
 
295
- def remove_emphasis!(str)
296
- str.gsub!($remove_emphasis_regex) do
194
+ def remove_emphasis(str)
195
+ str.gsub(REMOVE_EMPHASIS_REGEX) do
297
196
  $2
298
197
  end
299
198
  end
300
199
 
301
- def chrref_to_utf!(num_str)
302
- begin
303
- num_str.gsub!($chrref_to_utf_regex) do
304
- if $1 == 'x'
305
- ch = $2.to_i(16)
306
- else
307
- ch = $2.to_i
308
- end
309
- hi = ch>>8
310
- lo = ch&0xff
311
- u = "\377\376" << lo.chr << hi.chr
312
- u.encode("UTF-8", "UTF-16")
313
- end
314
- rescue StandardError
315
- return nil
200
+ def chrref_to_utf(num_str)
201
+ num_str.gsub(CHRREF_TO_UTF_REGEX) do
202
+ ch = if $1 == "x"
203
+ $2.to_i(16)
204
+ else
205
+ $2.to_i
206
+ end
207
+ hi = ch >> 8
208
+ lo = ch & 0xff
209
+ u = +"\377\376" << lo.chr << hi.chr
210
+ u.encode("UTF-8", "UTF-16")
316
211
  end
317
- return true
212
+ rescue StandardError
213
+ num_str
318
214
  end
319
-
320
- def mndash!(str)
321
- str.gsub!($mndash_regex, "–")
215
+
216
+ def mndash(str)
217
+ str.gsub(MNDASH_REGEX, "–")
322
218
  end
323
219
 
324
- def remove_hr!(str)
325
- str.gsub!($remove_hr_regex, "")
220
+ def remove_hr(str)
221
+ str.gsub(REMOVE_HR_REGEX, "")
326
222
  end
327
223
 
328
- def remove_ref!(str)
329
- str.gsub!($format_ref_regex){""}
224
+ def remove_ref(str)
225
+ str.gsub(FORMAT_REF_REGEX) { "" }
330
226
  end
331
227
 
332
- def remove_html!(str)
333
- str.gsub!(/<[^<>]+\/>/){""}
228
+ def remove_html(str)
229
+ res = +str.dup
230
+ res.gsub!(%r{<[^<>]+/>}) { "" }
334
231
  ["div", "gallery", "timeline", "noinclude"].each do |tag|
335
- scanner = StringScanner.new(str)
336
- result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
232
+ scanner = StringScanner.new(res)
233
+ result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do
337
234
  ""
338
235
  end
339
- str.replace(result)
236
+ res.replace(result)
340
237
  end
238
+ res
341
239
  end
342
240
 
343
- def remove_complex!(str)
344
- str.gsub!($complex_regex_01){"《#{$1}》"}
345
- str.gsub!($complex_regex_02){""}
346
- str.gsub!($complex_regex_03){""}
347
- str.gsub!($complex_regex_04){""}
348
- str.gsub!($complex_regex_05){""}
241
+ def remove_complex(str)
242
+ str = str.gsub(COMPLEX_REGEX_01) { "《#{$1}》" }
243
+ str = str.gsub(COMPLEX_REGEX_02) { "" }
244
+ str = str.gsub(COMPLEX_REGEX_03) { "" }
245
+ str = str.gsub(COMPLEX_REGEX_04) { "" }
246
+ str.gsub(COMPLEX_REGEX_05) { "" }
349
247
  end
350
-
351
- def make_reference!(str)
352
- str.gsub!($make_reference_regex_a){"\n"}
353
- str.gsub!($make_reference_regex_b){""}
354
- str.gsub!($make_reference_regex_c){"[ref]"}
355
- str.gsub!($make_reference_regex_d){"[/ref]"}
248
+
249
+ def make_reference(str)
250
+ str = str.gsub(MAKE_REFERENCE_REGEX_A) { "\n" }
251
+ str = str.gsub(MAKE_REFERENCE_REGEX_B) { "" }
252
+ str = str.gsub(MAKE_REFERENCE_REGEX_C) { "[ref]" }
253
+ str.gsub(MAKE_REFERENCE_REGEX_D) { "[/ref]" }
356
254
  end
357
255
 
358
- def correct_inline_template!(str)
256
+ def correct_inline_template(str)
359
257
  scanner = StringScanner.new(str)
360
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
258
+ process_nested_structure(scanner, "{{", "}}") do |contents|
361
259
  parts = contents.split("|")
362
260
  if /\A(?:lang|fontsize)\z/i =~ parts[0]
363
261
  parts.shift
364
- elsif /\Alang\-/i =~ parts[0]
262
+ elsif /\Alang-/i =~ parts[0]
365
263
  parts.shift
366
264
  elsif /\Alang=/i =~ parts[1]
367
265
  parts.shift
@@ -372,27 +270,25 @@ module Wp2txt
372
270
  else
373
271
  begin
374
272
  keyval = parts[1].split("=")
375
- if keyval.size > 1
376
- out = keyval[1]
377
- else
378
- out = parts[1] || ""
379
- end
380
- rescue
273
+ out = if keyval.size > 1
274
+ keyval[1]
275
+ else
276
+ parts[1] || ""
277
+ end
278
+ rescue StandardError
381
279
  out = parts[1] || ""
382
280
  end
383
281
  end
384
-
385
282
  out.strip
386
283
  end
387
- str.replace result
388
284
  end
389
285
 
390
- #################### file related utilities ####################
286
+ #################### file related utilities ####################
391
287
 
392
288
  # collect filenames recursively
393
289
  def collect_files(str, regex = nil)
394
290
  regex ||= //
395
- text_array = Array.new
291
+ text_array = []
396
292
  Find.find(str) do |f|
397
293
  text_array << f if regex =~ f
398
294
  end
@@ -400,11 +296,11 @@ module Wp2txt
400
296
  end
401
297
 
402
298
  # modify a file using block/yield mechanism
403
- def file_mod(file_path, backup = false, &block)
299
+ def file_mod(file_path, backup = false)
404
300
  File.open(file_path, "r") do |fr|
405
301
  str = fr.read
406
302
  newstr = yield(str)
407
- str = newstr unless newstr == nil
303
+ str = newstr if nil? newstr
408
304
  File.open("temp", "w") do |tf|
409
305
  tf.write(str)
410
306
  end
@@ -413,54 +309,50 @@ module Wp2txt
413
309
  File.rename(file_path, file_path + ".bak")
414
310
  File.rename("temp", file_path)
415
311
  File.unlink(file_path + ".bak") unless backup
416
- end
312
+ end
417
313
 
418
314
  # modify files under a directory (recursive)
419
- def batch_file_mod(dir_path, &block)
315
+ def batch_file_mod(dir_path)
420
316
  if FileTest.directory?(dir_path)
421
317
  collect_files(dir_path).each do |file|
422
318
  yield file if FileTest.file?(file)
423
319
  end
424
- else
425
- yield dir_path if FileTest.file?(dir_path)
320
+ elsif FileTest.file?(dir_path)
321
+ yield dir_path
426
322
  end
427
323
  end
428
324
 
429
325
  # take care of differences in path separators among environments
430
326
  def correct_separator(input)
431
- if input.is_a?(String)
432
- ret_str = String.new
327
+ case input
328
+ when String
433
329
  if RUBY_PLATFORM.index("win32")
434
- ret_str = input.gsub("/", "\\")
330
+ input.gsub("/", "\\")
435
331
  else
436
- ret_str = input.gsub("\\", "/")
332
+ input.gsub("\\", "/")
437
333
  end
438
- return ret_str
439
- elsif input.is_a?(Array)
440
- ret_array = Array.new
334
+ when Array
335
+ ret_array = []
441
336
  input.each do |item|
442
337
  ret_array << correct_separator(item)
443
338
  end
444
- return ret_array
339
+ ret_array
445
340
  end
446
341
  end
447
342
 
448
- def rename(files, ext = "txt")
343
+ def rename(files, ext = "txt")
449
344
  # num of digits necessary to name the last file generated
450
- maxwidth = 0
345
+ maxwidth = 0
451
346
 
452
347
  files.each do |f|
453
- width = f.slice(/\-(\d+)\z/, 1).to_s.length.to_i
348
+ width = f.slice(/-(\d+)\z/, 1).to_s.length.to_i
454
349
  maxwidth = width if maxwidth < width
455
- end
456
-
457
- files.each do |f|
458
- newname= f.sub(/\-(\d+)\z/) do
459
- "-" + sprintf("%0#{maxwidth}d", $1.to_i)
350
+ newname = f.sub(/-(\d+)\z/) do
351
+ "-" + format("%0#{maxwidth}d", $1.to_i)
460
352
  end
461
353
  File.rename(f, newname + ".#{ext}")
462
354
  end
463
- return true
355
+ true
464
356
  end
465
357
 
466
358
  # convert int of seconds to string in the format 00:00:00
@@ -472,8 +364,6 @@ module Wp2txt
472
364
  h = int / 3600
473
365
  m = (int - h * 3600) / 60
474
366
  s = int % 60
475
- str = sprintf("%02d:%02d:%02d", h, m, s)
476
- return str
367
+ format("%02d:%02d:%02d", h, m, s)
477
368
  end
478
-
479
369
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Wp2txt
2
- VERSION = "1.0.1"
4
+ VERSION = "1.1.0"
3
5
  end