wp2txt 1.0.2 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wp2txt/utils.rb CHANGED
@@ -1,183 +1,87 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- require 'strscan'
5
- require 'find'
6
- require 'htmlentities'
7
-
8
- ###################################################
9
- # global variables to save resource for generating regexps
10
- # those with a trailing number 1 represent opening tag/markup
11
- # those with a trailing number 2 represent closing tag/markup
12
- # those without a trailing number contain both opening/closing tags/markups
13
-
14
- $html_decoder = HTMLEntities.new
15
-
16
- $entities = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
17
- $html_hash = Hash[*$entities.flatten]
18
- $html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
19
- $ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
20
- $ml_template_end_regex = Regexp.new('\}\}\s*$')
21
- $ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
22
- $ml_linkend_regex = Regexp.new('\]\]\s*$')
23
- $isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
24
- $isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
25
- $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
26
- $in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
27
- $in_inputbox_regex1 = Regexp.new('<inputbox>')
28
- $in_inputbox_regex2 = Regexp.new('<\/inputbox>')
29
- $in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
30
- $in_source_regex1 = Regexp.new('<source.*?>')
31
- $in_source_regex2 = Regexp.new('<\/source>')
32
- $in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
33
- $in_math_regex1 = Regexp.new('<math.*?>')
34
- $in_math_regex2 = Regexp.new('<\/math>')
35
- $in_heading_regex = Regexp.new('^=+.*?=+$')
36
- $in_html_table_regex = Regexp.new('<table.*?><\/table>')
37
- $in_html_table_regex1 = Regexp.new('<table\b')
38
- $in_html_table_regex2 = Regexp.new('<\/\s*table>')
39
- $in_table_regex1 = Regexp.new('^\s*\{\|')
40
- $in_table_regex2 = Regexp.new('^\|\}.*?$')
41
- $in_unordered_regex = Regexp.new('^\*')
42
- $in_ordered_regex = Regexp.new('^\#')
43
- $in_pre_regex = Regexp.new('^ ')
44
- $in_definition_regex = Regexp.new('^[\;\:]')
45
- $blank_line_regex = Regexp.new('^\s*$')
46
- $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
47
- $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
48
- $remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
49
- $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
50
- $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
51
- $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
52
- $remove_hr_regex = Regexp.new('^\s*\-+\s*$')
53
- $make_reference_regex_a = Regexp.new('<br ?\/>')
54
- $make_reference_regex_b = Regexp.new('<ref[^>]*\/>')
55
- $make_reference_regex_c = Regexp.new('<ref[^>]*>')
56
- $make_reference_regex_d = Regexp.new('<\/ref>')
57
- $format_ref_regex = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
58
- $heading_onset_regex = Regexp.new('^(\=+)\s+')
59
- $heading_coda_regex = Regexp.new('\s+(\=+)$')
60
- $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
61
- $pre_marks_regex = Regexp.new('\A\^\ ')
62
- $def_marks_regex = Regexp.new('\A[\;\:\ ]+')
63
- $onset_bar_regex = Regexp.new('\A[^\|]+\z')
64
-
65
- $category_patterns = ["Category", "Categoria"].join("|")
66
- $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
67
-
68
- $escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
69
- $unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
70
-
71
- $remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
72
- $remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
73
- $type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
74
-
75
- $single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escape(']')})", Regexp::MULTILINE)
76
- $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
77
- $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
78
- $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
79
- $curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
80
-
81
- $complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
82
- $complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
83
- $complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
84
- $complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
85
- $complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
86
-
87
- $cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
88
- $cleanup_regex_02 = Regexp.new('^File:.+$')
89
- $cleanup_regex_03 = Regexp.new('^\|.*$')
90
- $cleanup_regex_04 = Regexp.new('\{\{.*$')
91
- $cleanup_regex_05 = Regexp.new('^.*\}\}')
92
- $cleanup_regex_06 = Regexp.new('\{\|.*$')
93
- $cleanup_regex_07 = Regexp.new('^.*\|\}')
94
- $cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
95
-
96
- ###################################################
1
+ # frozen_string_literal: true
97
2
 
98
- module Wp2txt
3
+ require "strscan"
4
+ require "find"
5
+ require_relative "regex"
99
6
 
100
- def convert_characters!(text, has_retried = false)
101
- begin
102
- text << ""
103
- chrref_to_utf!(text)
104
- special_chr!(text)
105
- text.encode!("UTF-8", "UTF-8", invalid: :replace, replace: "")
106
-
107
- rescue # detect invalid byte sequence in UTF-8
108
- if has_retried
109
- puts "invalid byte sequence detected"
110
- puts "******************************"
111
- File.open("error_log.txt", "w") do |f|
112
- f.write text
113
- end
114
- exit
115
- else
116
- text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
117
- text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
118
- convert_characters!(text, true)
7
+ module Wp2txt
8
+ def convert_characters(text, has_retried = false)
9
+ text << ""
10
+ text = chrref_to_utf(text)
11
+ text = special_chr(text)
12
+ text = text.encode("UTF-8", "UTF-8", invalid: :replace, replace: "")
13
+ rescue StandardError # detect invalid byte sequence in UTF-8
14
+ if has_retried
15
+ puts "invalid byte sequence detected"
16
+ puts "******************************"
17
+ File.open("error_log.txt", "w") do |f|
18
+ f.write text
119
19
  end
20
+ exit
21
+ else
22
+ text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
23
+ text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
24
+ convert_characters(text, true)
120
25
  end
121
26
  end
122
27
 
123
- def format_wiki!(text, has_retried = false)
124
- remove_complex!(text)
125
-
126
- escape_nowiki!(text)
127
- process_interwiki_links!(text)
128
- process_external_links!(text)
129
- unescape_nowiki!(text)
130
- remove_directive!(text)
131
- remove_emphasis!(text)
132
- mndash!(text)
133
- remove_hr!(text)
134
- remove_tag!(text)
135
- correct_inline_template!(text) unless $leave_inline_template
136
- remove_templates!(text) unless $leave_inline_template
137
- remove_table!(text) unless $leave_table
28
+ def format_wiki(text, config = {})
29
+ text = remove_complex(text)
30
+ text = escape_nowiki(text)
31
+ text = process_interwiki_links(text)
32
+ text = process_external_links(text)
33
+ text = unescape_nowiki(text)
34
+ text = remove_directive(text)
35
+ text = remove_emphasis(text)
36
+ text = mndash(text)
37
+ text = remove_hr(text)
38
+ text = remove_tag(text)
39
+ text = correct_inline_template(text) unless config[:inline]
40
+ text = remove_templates(text) unless config[:inline]
41
+ text = remove_table(text) unless config[:table]
42
+ text
138
43
  end
139
44
 
140
- def cleanup!(text)
141
- text.gsub!($cleanup_regex_01){""}
142
- text.gsub!($cleanup_regex_02){""}
143
- text.gsub!($cleanup_regex_03){""}
144
- text.gsub!($cleanup_regex_04){""}
145
- text.gsub!($cleanup_regex_05){""}
146
- text.gsub!($cleanup_regex_06){""}
147
- text.gsub!($cleanup_regex_07){""}
148
- text.gsub!($cleanup_regex_08){"\n\n"}
149
- text.strip!
45
+ def cleanup(text)
46
+ text = text.gsub(CLEANUP_REGEX_01) { "" }
47
+ text = text.gsub(CLEANUP_REGEX_02) { "" }
48
+ text = text.gsub(CLEANUP_REGEX_03) { "" }
49
+ text = text.gsub(CLEANUP_REGEX_04) { "" }
50
+ text = text.gsub(CLEANUP_REGEX_05) { "" }
51
+ text = text.gsub(CLEANUP_REGEX_06) { "" }
52
+ text = text.gsub(CLEANUP_REGEX_07) { "" }
53
+ text = text.gsub(CLEANUP_REGEX_08) { "\n\n" }
54
+ text = text.strip
150
55
  text << "\n\n"
151
56
  end
152
57
 
153
58
  #################### parser for nested structure ####################
154
59
 
155
60
  def process_nested_structure(scanner, left, right, &block)
156
- test = false
157
- buffer = ""
61
+ buffer = +""
158
62
  begin
159
- if left == "[" && right == "]"
160
- regex = $single_square_bracket_regex
161
- elsif left == "[[" && right == "]]"
162
- regex = $double_square_bracket_regex
163
- elsif left == "{" && right == "}"
164
- regex = $single_curly_bracket_regex
165
- elsif left == "{{" && right == "}}"
166
- regex = $double_curly_bracket_regex
167
- elsif left == "{|" && right == "|}"
168
- regex = $curly_square_bracket_regex
169
- else
170
- regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
171
- end
172
- while str = scanner.scan_until(regex)
63
+ regex = if left == "[" && right == "]"
64
+ SINGLE_SQUARE_BRACKET_REGEX
65
+ elsif left == "[[" && right == "]]"
66
+ DOUBLE_SQUARE_BRACKET_REGEX
67
+ elsif left == "{" && right == "}"
68
+ SINGLE_CURLY_BRACKET_REGEX
69
+ elsif left == "{{" && right == "}}"
70
+ DOUBLE_CURLY_BRACKET_REGEX
71
+ elsif left == "{|" && right == "|}"
72
+ CURLY_SQUARE_BRACKET_REGEX
73
+ else
74
+ Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
75
+ end
76
+ while (str = scanner.scan_until(regex))
173
77
  case scanner[1]
174
78
  when left
175
79
  buffer << str
176
80
  has_left = true
177
81
  when right
178
82
  if has_left
179
- buffer = buffer[0...-(left.size)]
180
- contents = block.call(str[0...-(left.size)])
83
+ buffer = buffer[0...-left.size]
84
+ contents = block.call(str[0...-left.size])
181
85
  buffer << contents
182
86
  break
183
87
  else
@@ -187,25 +91,23 @@ module Wp2txt
187
91
  end
188
92
  buffer << scanner.rest
189
93
 
190
- if buffer == scanner.string
191
- return buffer
192
- else
193
- scanner.string = buffer
194
- return process_nested_structure(scanner, left, right, &block) || ""
195
- end
196
- rescue => e
197
- return scanner.string
94
+ return buffer if buffer == scanner.string
95
+
96
+ scanner.string = buffer
97
+ process_nested_structure(scanner, left, right, &block) || ""
98
+ rescue StandardError
99
+ scanner.string
198
100
  end
199
101
  end
200
102
 
201
103
  #################### methods used from format_wiki ####################
202
- def escape_nowiki!(str)
104
+ def escape_nowiki(str)
203
105
  if @nowikis
204
106
  @nowikis.clear
205
107
  else
206
108
  @nowikis = {}
207
109
  end
208
- str.gsub!($escape_nowiki_regex) do
110
+ str.gsub(ESCAPE_NOWIKI_REGEX) do
209
111
  nowiki = $1
210
112
  nowiki_id = nowiki.object_id
211
113
  @nowikis[nowiki_id] = nowiki
@@ -213,16 +115,16 @@ module Wp2txt
213
115
  end
214
116
  end
215
117
 
216
- def unescape_nowiki!(str)
217
- str.gsub!($unescape_nowiki_regex) do
118
+ def unescape_nowiki(str)
119
+ str.gsub(UNESCAPE_NOWIKI_REGEX) do
218
120
  obj_id = $1.to_i
219
121
  @nowikis[obj_id]
220
122
  end
221
123
  end
222
124
 
223
- def process_interwiki_links!(str)
125
+ def process_interwiki_links(str)
224
126
  scanner = StringScanner.new(str)
225
- result = process_nested_structure(scanner, "[[", "]]") do |contents|
127
+ process_nested_structure(scanner, "[[", "]]") do |contents|
226
128
  parts = contents.split("|")
227
129
  case parts.size
228
130
  when 1
@@ -232,12 +134,11 @@ module Wp2txt
232
134
  parts.join("|")
233
135
  end
234
136
  end
235
- str.replace(result)
236
137
  end
237
138
 
238
- def process_external_links!(str)
139
+ def process_external_links(str)
239
140
  scanner = StringScanner.new(str)
240
- result = process_nested_structure(scanner, "[", "]") do |contents|
141
+ process_nested_structure(scanner, "[", "]") do |contents|
241
142
  if /\A\s.+\s\z/ =~ contents
242
143
  " (#{contents.strip}) "
243
144
  else
@@ -250,119 +151,115 @@ module Wp2txt
250
151
  end
251
152
  end
252
153
  end
253
- str.replace(result)
254
154
  end
255
155
 
256
156
  #################### methods used from format_article ####################
257
157
 
258
- def remove_templates!(str)
259
- scanner = StringScanner.new(str)
260
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
158
+ def remove_templates(str)
159
+ scanner1 = StringScanner.new(str)
160
+ result = process_nested_structure(scanner1, "{{", "}}") do
261
161
  ""
262
162
  end
263
- scanner = StringScanner.new(result)
264
- result = process_nested_structure(scanner, "{", "}") do |contents|
163
+ scanner2 = StringScanner.new(result)
164
+ process_nested_structure(scanner2, "{", "}") do
265
165
  ""
266
166
  end
267
- str.replace(result)
268
167
  end
269
168
 
270
- def remove_table!(str)
169
+ def remove_table(str)
271
170
  scanner = StringScanner.new(str)
272
- result = process_nested_structure(scanner, "{|", "|}") do |contents|
171
+ process_nested_structure(scanner, "{|", "|}") do
273
172
  ""
274
173
  end
275
- str.replace(result)
276
174
  end
277
175
 
278
- def special_chr!(str)
279
- str.replace $html_decoder.decode(str)
176
+ def special_chr(str)
177
+ HTML_DECODER.decode(str)
280
178
  end
281
179
 
282
- def remove_inbetween!(str, tagset = ['<', '>'])
180
+ def remove_inbetween(str, tagset = ["<", ">"])
283
181
  tagsets = Regexp.quote(tagset.uniq.join(""))
284
182
  regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
285
- str.gsub!(regex, "")
183
+ str.gsub(regex, "")
286
184
  end
287
185
 
288
- def remove_tag!(str)
289
- str.gsub!($remove_tag_regex, "")
186
+ def remove_tag(str)
187
+ str.gsub(REMOVE_TAG_REGEX, "")
290
188
  end
291
189
 
292
- def remove_directive!(str)
293
- str.gsub!($remove_directives_regex, "")
190
+ def remove_directive(str)
191
+ str.gsub(REMOVE_DIRECTIVES_REGEX, "")
294
192
  end
295
193
 
296
- def remove_emphasis!(str)
297
- str.gsub!($remove_emphasis_regex) do
194
+ def remove_emphasis(str)
195
+ str.gsub(REMOVE_EMPHASIS_REGEX) do
298
196
  $2
299
197
  end
300
198
  end
301
199
 
302
- def chrref_to_utf!(num_str)
303
- begin
304
- num_str.gsub!($chrref_to_utf_regex) do
305
- if $1 == 'x'
306
- ch = $2.to_i(16)
307
- else
308
- ch = $2.to_i
309
- end
310
- hi = ch>>8
311
- lo = ch&0xff
312
- u = "\377\376" << lo.chr << hi.chr
313
- u.encode("UTF-8", "UTF-16")
314
- end
315
- rescue StandardError
316
- return nil
200
+ def chrref_to_utf(num_str)
201
+ num_str.gsub(CHRREF_TO_UTF_REGEX) do
202
+ ch = if $1 == "x"
203
+ $2.to_i(16)
204
+ else
205
+ $2.to_i
206
+ end
207
+ hi = ch >> 8
208
+ lo = ch & 0xff
209
+ u = +"\377\376" << lo.chr << hi.chr
210
+ u.encode("UTF-8", "UTF-16")
317
211
  end
318
- return true
212
+ rescue StandardError
213
+ num_str
319
214
  end
320
215
 
321
- def mndash!(str)
322
- str.gsub!($mndash_regex, "–")
216
+ def mndash(str)
217
+ str.gsub(MNDASH_REGEX, "–")
323
218
  end
324
219
 
325
- def remove_hr!(str)
326
- str.gsub!($remove_hr_regex, "")
220
+ def remove_hr(str)
221
+ str.gsub(REMOVE_HR_REGEX, "")
327
222
  end
328
223
 
329
- def remove_ref!(str)
330
- str.gsub!($format_ref_regex){""}
224
+ def remove_ref(str)
225
+ str.gsub(FORMAT_REF_REGEX) { "" }
331
226
  end
332
227
 
333
- def remove_html!(str)
334
- str.gsub!(/<[^<>]+\/>/){""}
228
+ def remove_html(str)
229
+ res = +str.dup
230
+ res.gsub!(%r{<[^<>]+/>}) { "" }
335
231
  ["div", "gallery", "timeline", "noinclude"].each do |tag|
336
- scanner = StringScanner.new(str)
337
- result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
232
+ scanner = StringScanner.new(res)
233
+ result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do
338
234
  ""
339
235
  end
340
- str.replace(result)
236
+ res.replace(result)
341
237
  end
238
+ res
342
239
  end
343
240
 
344
- def remove_complex!(str)
345
- str.gsub!($complex_regex_01){"《#{$1}》"}
346
- str.gsub!($complex_regex_02){""}
347
- str.gsub!($complex_regex_03){""}
348
- str.gsub!($complex_regex_04){""}
349
- str.gsub!($complex_regex_05){""}
241
+ def remove_complex(str)
242
+ str = str.gsub(COMPLEX_REGEX_01) { "《#{$1}》" }
243
+ str = str.gsub(COMPLEX_REGEX_02) { "" }
244
+ str = str.gsub(COMPLEX_REGEX_03) { "" }
245
+ str = str.gsub(COMPLEX_REGEX_04) { "" }
246
+ str.gsub(COMPLEX_REGEX_05) { "" }
350
247
  end
351
248
 
352
- def make_reference!(str)
353
- str.gsub!($make_reference_regex_a){"\n"}
354
- str.gsub!($make_reference_regex_b){""}
355
- str.gsub!($make_reference_regex_c){"[ref]"}
356
- str.gsub!($make_reference_regex_d){"[/ref]"}
249
+ def make_reference(str)
250
+ str = str.gsub(MAKE_REFERENCE_REGEX_A) { "\n" }
251
+ str = str.gsub(MAKE_REFERENCE_REGEX_B) { "" }
252
+ str = str.gsub(MAKE_REFERENCE_REGEX_C) { "[ref]" }
253
+ str.gsub(MAKE_REFERENCE_REGEX_D) { "[/ref]" }
357
254
  end
358
255
 
359
- def correct_inline_template!(str)
256
+ def correct_inline_template(str)
360
257
  scanner = StringScanner.new(str)
361
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
258
+ process_nested_structure(scanner, "{{", "}}") do |contents|
362
259
  parts = contents.split("|")
363
260
  if /\A(?:lang|fontsize)\z/i =~ parts[0]
364
261
  parts.shift
365
- elsif /\Alang\-/i =~ parts[0]
262
+ elsif /\Alang-/i =~ parts[0]
366
263
  parts.shift
367
264
  elsif /\Alang=/i =~ parts[1]
368
265
  parts.shift
@@ -373,27 +270,25 @@ module Wp2txt
373
270
  else
374
271
  begin
375
272
  keyval = parts[1].split("=")
376
- if keyval.size > 1
377
- out = keyval[1]
378
- else
379
- out = parts[1] || ""
380
- end
381
- rescue
273
+ out = if keyval.size > 1
274
+ keyval[1]
275
+ else
276
+ parts[1] || ""
277
+ end
278
+ rescue StandardError
382
279
  out = parts[1] || ""
383
280
  end
384
281
  end
385
-
386
282
  out.strip
387
283
  end
388
- str.replace result
389
284
  end
390
285
 
391
- #################### file related utilities ####################
286
+ #################### file related utilities ####################
392
287
 
393
288
  # collect filenames recursively
394
289
  def collect_files(str, regex = nil)
395
290
  regex ||= //
396
- text_array = Array.new
291
+ text_array = []
397
292
  Find.find(str) do |f|
398
293
  text_array << f if regex =~ f
399
294
  end
@@ -401,11 +296,11 @@ module Wp2txt
401
296
  end
402
297
 
403
298
  # modify a file using block/yield mechanism
404
- def file_mod(file_path, backup = false, &block)
299
+ def file_mod(file_path, backup = false)
405
300
  File.open(file_path, "r") do |fr|
406
301
  str = fr.read
407
302
  newstr = yield(str)
408
- str = newstr unless newstr == nil
303
+ str = newstr if nil? newstr
409
304
  File.open("temp", "w") do |tf|
410
305
  tf.write(str)
411
306
  end
@@ -417,32 +312,31 @@ module Wp2txt
417
312
  end
418
313
 
419
314
  # modify files under a directry (recursive)
420
- def batch_file_mod(dir_path, &block)
315
+ def batch_file_mod(dir_path)
421
316
  if FileTest.directory?(dir_path)
422
317
  collect_files(dir_path).each do |file|
423
318
  yield file if FileTest.file?(file)
424
319
  end
425
- else
426
- yield dir_path if FileTest.file?(dir_path)
320
+ elsif FileTest.file?(dir_path)
321
+ yield dir_path
427
322
  end
428
323
  end
429
324
 
430
325
  # take care of difference of separators among environments
431
326
  def correct_separator(input)
432
- if input.is_a?(String)
433
- ret_str = String.new
327
+ case input
328
+ when String
434
329
  if RUBY_PLATFORM.index("win32")
435
- ret_str = input.gsub("/", "\\")
330
+ input.gsub("/", "\\")
436
331
  else
437
- ret_str = input.gsub("\\", "/")
332
+ input.gsub("\\", "/")
438
333
  end
439
- return ret_str
440
- elsif input.is_a?(Array)
441
- ret_array = Array.new
334
+ when Array
335
+ ret_array = []
442
336
  input.each do |item|
443
337
  ret_array << correct_separator(item)
444
338
  end
445
- return ret_array
339
+ ret_array
446
340
  end
447
341
  end
448
342
 
@@ -451,17 +345,14 @@ module Wp2txt
451
345
  maxwidth = 0
452
346
 
453
347
  files.each do |f|
454
- width = f.slice(/\-(\d+)\z/, 1).to_s.length.to_i
348
+ width = f.slice(/-(\d+)\z/, 1).to_s.length.to_i
455
349
  maxwidth = width if maxwidth < width
456
- end
457
-
458
- files.each do |f|
459
- newname= f.sub(/\-(\d+)\z/) do
460
- "-" + sprintf("%0#{maxwidth}d", $1.to_i)
350
+ newname = f.sub(/-(\d+)\z/) do
351
+ "-" + format("%0#{maxwidth}d", $1.to_i)
461
352
  end
462
353
  File.rename(f, newname + ".#{ext}")
463
354
  end
464
- return true
355
+ true
465
356
  end
466
357
 
467
358
  # convert int of seconds to string in the format 00:00:00
@@ -473,8 +364,6 @@ module Wp2txt
473
364
  h = int / 3600
474
365
  m = (int - h * 3600) / 60
475
366
  s = int % 60
476
- str = sprintf("%02d:%02d:%02d", h, m, s)
477
- return str
367
+ format("%02d:%02d:%02d", h, m, s)
478
368
  end
479
-
480
369
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Wp2txt
2
- VERSION = "1.0.2"
4
+ VERSION = "1.1.1"
3
5
  end