wp2txt 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wp2txt/utils.rb CHANGED
@@ -1,182 +1,87 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- require 'strscan'
5
- require 'find'
6
- require 'htmlentities'
7
-
8
- ###################################################
9
- # global variables to save resource for generating regexps
10
- # those with a trailing number 1 represent opening tag/markup
11
- # those with a trailing number 2 represent closing tag/markup
12
- # those without a trailing number contain both opening/closing tags/markups
13
-
14
- $html_decoder = HTMLEntities.new
15
-
16
- $entities = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
17
- $html_hash = Hash[*$entities.flatten]
18
- $html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
19
- $ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
20
- $ml_template_end_regex = Regexp.new('\}\}\s*$')
21
- $ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
22
- $ml_linkend_regex = Regexp.new('\]\]\s*$')
23
- $isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
24
- $isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
25
- $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
26
- $in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
27
- $in_inputbox_regex1 = Regexp.new('<inputbox>')
28
- $in_inputbox_regex2 = Regexp.new('<\/inputbox>')
29
- $in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
30
- $in_source_regex1 = Regexp.new('<source.*?>')
31
- $in_source_regex2 = Regexp.new('<\/source>')
32
- $in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
33
- $in_math_regex1 = Regexp.new('<math.*?>')
34
- $in_math_regex2 = Regexp.new('<\/math>')
35
- $in_heading_regex = Regexp.new('^=+.*?=+$')
36
- $in_html_table_regex = Regexp.new('<table.*?><\/table>')
37
- $in_html_table_regex1 = Regexp.new('<table\b')
38
- $in_html_table_regex2 = Regexp.new('<\/\s*table>')
39
- $in_table_regex1 = Regexp.new('^\s*\{\|')
40
- $in_table_regex2 = Regexp.new('^\|\}.*?$')
41
- $in_unordered_regex = Regexp.new('^\*')
42
- $in_ordered_regex = Regexp.new('^\#')
43
- $in_pre_regex = Regexp.new('^ ')
44
- $in_definition_regex = Regexp.new('^[\;\:]')
45
- $blank_line_regex = Regexp.new('^\s*$')
46
- $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
47
- $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
48
- $remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
49
- $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
50
- $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
51
- $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
52
- $remove_hr_regex = Regexp.new('^\s*\-+\s*$')
53
- $make_reference_regex_a = Regexp.new('<br ?\/>')
54
- $make_reference_regex_b = Regexp.new('<ref[^>]*\/>')
55
- $make_reference_regex_c = Regexp.new('<ref[^>]*>')
56
- $make_reference_regex_d = Regexp.new('<\/ref>')
57
- $format_ref_regex = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
58
- $heading_onset_regex = Regexp.new('^(\=+)\s+')
59
- $heading_coda_regex = Regexp.new('\s+(\=+)$')
60
- $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
61
- $pre_marks_regex = Regexp.new('\A\^\ ')
62
- $def_marks_regex = Regexp.new('\A[\;\:\ ]+')
63
- $onset_bar_regex = Regexp.new('\A[^\|]+\z')
64
-
65
- $category_patterns = ["Category", "Categoria"].join("|")
66
- $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
67
-
68
- $escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
69
- $unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
70
-
71
- $remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
72
- $remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
73
- $type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
74
-
75
- $single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escape(']')})", Regexp::MULTILINE)
76
- $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
77
- $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
78
- $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
79
- $curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
80
-
81
- $complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
82
- $complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
83
- $complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
84
- $complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
85
- $complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
86
-
87
- $cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
88
- $cleanup_regex_02 = Regexp.new('^File:.+$')
89
- $cleanup_regex_03 = Regexp.new('^\|.*$')
90
- $cleanup_regex_04 = Regexp.new('\{\{.*$')
91
- $cleanup_regex_05 = Regexp.new('^.*\}\}')
92
- $cleanup_regex_06 = Regexp.new('\{\|.*$')
93
- $cleanup_regex_07 = Regexp.new('^.*\|\}')
94
- $cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
95
-
96
- ###################################################
1
+ # frozen_string_literal: true
97
2
 
98
- module Wp2txt
3
+ require "strscan"
4
+ require "find"
5
+ require_relative "regex"
99
6
 
100
- def convert_characters!(text, has_retried = false)
101
- begin
102
- text << ""
103
- chrref_to_utf!(text)
104
- special_chr!(text)
105
-
106
- rescue # detect invalid byte sequence in UTF-8
107
- if has_retried
108
- puts "invalid byte sequence detected"
109
- puts "******************************"
110
- File.open("error_log.txt", "w") do |f|
111
- f.write text
112
- end
113
- exit
114
- else
115
- text.encode!("UTF-16")
116
- text.encode!("UTF-8")
117
- convert_characters!(text, true)
7
+ module Wp2txt
8
# Decode numeric character references and HTML entities in +text+ and
# return the result as UTF-8 with invalid byte sequences removed.
#
# text        - String to convert.
# has_retried - true when this call is the single retry made after a
#               failure; a second failure is treated as fatal.
#
# On the first StandardError the text is scrubbed to valid UTF-8 and the
# conversion is attempted once more. If the retry fails too, the text is
# written to "error_log.txt" in the current directory and the process exits.
def convert_characters(text, has_retried = false)
  # NOTE(review): kept from the original; presumably meant to fail early on
  # a frozen receiver so that the rescue branch can replace it with a copy.
  text << ""
  text = chrref_to_utf(text)
  text = special_chr(text)
  text.encode("UTF-8", "UTF-8", invalid: :replace, replace: "")
rescue StandardError # e.g. invalid byte sequence in UTF-8
  if has_retried
    puts "invalid byte sequence detected"
    puts "******************************"
    File.open("error_log.txt", "w") do |f|
      f.write text
    end
    exit
  else
    # The retry must receive valid UTF-8. Re-tagging the bytes as UTF-16
    # (as the previous revision did) made every regexp-based step of the
    # retry fail, so the retry always ended in the fatal branch above.
    text = text.dup.force_encoding(Encoding::UTF_8).scrub("")
    convert_characters(text, true)
  end
end
121
-
122
- def format_wiki!(text, has_retried = false)
123
- remove_complex!(text)
124
-
125
- escape_nowiki!(text)
126
- process_interwiki_links!(text)
127
- process_external_links!(text)
128
- unescape_nowiki!(text)
129
- remove_directive!(text)
130
- remove_emphasis!(text)
131
- mndash!(text)
132
- remove_hr!(text)
133
- remove_tag!(text)
134
- correct_inline_template!(text) unless $leave_inline_template
135
- remove_templates!(text) unless $leave_inline_template
136
- remove_table!(text) unless $leave_table
27
+
28
# Run the wiki-markup clean-up pipeline over +text+ and return the result.
#
# config[:inline] - when truthy, inline templates are left alone (neither
#                   corrected nor removed).
# config[:table]  - when truthy, tables are left in place.
def format_wiki(text, config = {})
  steps = %i[
    remove_complex
    escape_nowiki
    process_interwiki_links
    process_external_links
    unescape_nowiki
    remove_directive
    remove_emphasis
    mndash
    remove_hr
    remove_tag
  ]
  steps += %i[correct_inline_template remove_templates] unless config[:inline]
  steps << :remove_table unless config[:table]

  # Each step takes the current text and returns the next version of it.
  steps.reduce(text) { |current, step| send(step, current) }
end
138
-
139
- def cleanup!(text)
140
- text.gsub!($cleanup_regex_01){""}
141
- text.gsub!($cleanup_regex_02){""}
142
- text.gsub!($cleanup_regex_03){""}
143
- text.gsub!($cleanup_regex_04){""}
144
- text.gsub!($cleanup_regex_05){""}
145
- text.gsub!($cleanup_regex_06){""}
146
- text.gsub!($cleanup_regex_07){""}
147
- text.gsub!($cleanup_regex_08){"\n\n"}
148
- text.strip!
44
+
45
# Remove leftover markup fragments from +text+ and normalise its spacing.
#
# Matches of CLEANUP_REGEX_01..07 are deleted in order, matches of
# CLEANUP_REGEX_08 are replaced by exactly two newlines, and the result is
# stripped and terminated with "\n\n". Returns a new string.
def cleanup(text)
  deletions = [
    CLEANUP_REGEX_01, CLEANUP_REGEX_02, CLEANUP_REGEX_03, CLEANUP_REGEX_04,
    CLEANUP_REGEX_05, CLEANUP_REGEX_06, CLEANUP_REGEX_07
  ]
  pruned = deletions.reduce(text) { |current, pattern| current.gsub(pattern) { "" } }
  collapsed = pruned.gsub(CLEANUP_REGEX_08) { "\n\n" }
  collapsed.strip << "\n\n"
end
151
57
 
152
58
  #################### parser for nested structure ####################
153
-
59
+
154
60
  def process_nested_structure(scanner, left, right, &block)
155
- test = false
156
- buffer = ""
61
+ buffer = +""
157
62
  begin
158
- if left == "[" && right == "]"
159
- regex = $single_square_bracket_regex
160
- elsif left == "[[" && right == "]]"
161
- regex = $double_square_bracket_regex
162
- elsif left == "{" && right == "}"
163
- regex = $single_curly_bracket_regex
164
- elsif left == "{{" && right == "}}"
165
- regex = $double_curly_bracket_regex
166
- elsif left == "{|" && right == "|}"
167
- regex = $curly_square_bracket_regex
168
- else
169
- regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
170
- end
171
- while str = scanner.scan_until(regex)
63
+ regex = if left == "[" && right == "]"
64
+ SINGLE_SQUARE_BRACKET_REGEX
65
+ elsif left == "[[" && right == "]]"
66
+ DOUBLE_SQUARE_BRACKET_REGEX
67
+ elsif left == "{" && right == "}"
68
+ SINGLE_CURLY_BRACKET_REGEX
69
+ elsif left == "{{" && right == "}}"
70
+ DOUBLE_CURLY_BRACKET_REGEX
71
+ elsif left == "{|" && right == "|}"
72
+ CURLY_SQUARE_BRACKET_REGEX
73
+ else
74
+ Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
75
+ end
76
+ while (str = scanner.scan_until(regex))
172
77
  case scanner[1]
173
78
  when left
174
79
  buffer << str
175
80
  has_left = true
176
81
  when right
177
82
  if has_left
178
- buffer = buffer[0...-(left.size)]
179
- contents = block.call(str[0...-(left.size)])
83
+ buffer = buffer[0...-left.size]
84
+ contents = block.call(str[0...-left.size])
180
85
  buffer << contents
181
86
  break
182
87
  else
@@ -186,25 +91,23 @@ module Wp2txt
186
91
  end
187
92
  buffer << scanner.rest
188
93
 
189
- if buffer == scanner.string
190
- return buffer
191
- else
192
- scanner.string = buffer
193
- return process_nested_structure(scanner, left, right, &block) || ""
194
- end
195
- rescue => e
196
- return scanner.string
94
+ return buffer if buffer == scanner.string
95
+
96
+ scanner.string = buffer
97
+ process_nested_structure(scanner, left, right, &block) || ""
98
+ rescue StandardError
99
+ scanner.string
197
100
  end
198
- end
101
+ end
199
102
 
200
103
  #################### methods used from format_wiki ####################
201
- def escape_nowiki!(str)
104
+ def escape_nowiki(str)
202
105
  if @nowikis
203
106
  @nowikis.clear
204
107
  else
205
108
  @nowikis = {}
206
109
  end
207
- str.gsub!($escape_nowiki_regex) do
110
+ str.gsub(ESCAPE_NOWIKI_REGEX) do
208
111
  nowiki = $1
209
112
  nowiki_id = nowiki.object_id
210
113
  @nowikis[nowiki_id] = nowiki
@@ -212,17 +115,17 @@ module Wp2txt
212
115
  end
213
116
  end
214
117
 
215
- def unescape_nowiki!(str)
216
- str.gsub!($unescape_nowiki_regex) do
118
# Replace every placeholder matched by UNESCAPE_NOWIKI_REGEX with the text
# stored in @nowikis under the integer id captured by the placeholder.
# Returns a new string.
def unescape_nowiki(str)
  str.gsub(UNESCAPE_NOWIKI_REGEX) { @nowikis[Regexp.last_match(1).to_i] }
end
221
-
222
- def process_interwiki_links!(str)
124
+
125
+ def process_interwiki_links(str)
223
126
  scanner = StringScanner.new(str)
224
- result = process_nested_structure(scanner, "[[", "]]") do |contents|
225
- parts = contents.split("|")
127
+ process_nested_structure(scanner, "[[", "]]") do |contents|
128
+ parts = contents.split("|")
226
129
  case parts.size
227
130
  when 1
228
131
  parts.first || ""
@@ -231,12 +134,11 @@ module Wp2txt
231
134
  parts.join("|")
232
135
  end
233
136
  end
234
- str.replace(result)
235
137
  end
236
138
 
237
- def process_external_links!(str)
139
+ def process_external_links(str)
238
140
  scanner = StringScanner.new(str)
239
- result = process_nested_structure(scanner, "[", "]") do |contents|
141
+ process_nested_structure(scanner, "[", "]") do |contents|
240
142
  if /\A\s.+\s\z/ =~ contents
241
143
  " (#{contents.strip}) "
242
144
  else
@@ -249,119 +151,115 @@ module Wp2txt
249
151
  end
250
152
  end
251
153
  end
252
- str.replace(result)
253
154
  end
254
155
 
255
156
  #################### methods used from format_article ####################
256
157
 
257
- def remove_templates!(str)
258
- scanner = StringScanner.new(str)
259
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
158
# Delete template markup from +str+: first everything delimited by "{{" and
# "}}", then whatever is still delimited by "{" and "}". Returns the result
# of the second pass.
def remove_templates(str)
  without_double = process_nested_structure(StringScanner.new(str), "{{", "}}") { "" }
  process_nested_structure(StringScanner.new(without_double), "{", "}") { "" }
end
268
-
269
- def remove_table!(str)
168
+
169
# Delete wiki tables (everything delimited by "{|" and "|}") from +str+ and
# return the result.
def remove_table(str)
  process_nested_structure(StringScanner.new(str), "{|", "|}") { "" }
end
276
-
277
- def special_chr!(str)
278
- str.replace $html_decoder.decode(str)
175
+
176
# Return +str+ as decoded by HTML_DECODER.
# NOTE(review): HTML_DECODER is defined outside this file; in 1.0.1 the
# equivalent global was an HTMLEntities instance - confirm against regex.rb.
def special_chr(str)
  HTML_DECODER.decode(str)
end
280
179
 
281
- def remove_inbetween!(str, tagset = ['<', '>'])
180
# Delete every span of +str+ that starts with tagset[0], ends with tagset[1]
# and contains none of the delimiter characters in between.
#
# tagset - two-element Array of opening and closing delimiter.
#
# Returns a new string.
def remove_inbetween(str, tagset = ["<", ">"])
  opener = Regexp.escape(tagset[0])
  closer = Regexp.escape(tagset[1])
  # Characters that may not occur inside a span: the delimiters themselves.
  excluded = Regexp.quote(tagset.uniq.join(""))
  span = /#{opener}[^#{excluded}]*#{closer}/
  str.gsub(span, "")
end
286
185
 
287
- def remove_tag!(str)
288
- str.gsub!($remove_tag_regex, "")
186
# Return a copy of +str+ with every match of REMOVE_TAG_REGEX deleted.
def remove_tag(str)
  str.gsub(REMOVE_TAG_REGEX) { "" }
end
290
189
 
291
- def remove_directive!(str)
292
- str.gsub!($remove_directives_regex, "")
190
# Return a copy of +str+ with every match of REMOVE_DIRECTIVES_REGEX deleted.
def remove_directive(str)
  str.gsub(REMOVE_DIRECTIVES_REGEX) { "" }
end
294
193
 
295
- def remove_emphasis!(str)
296
- str.gsub!($remove_emphasis_regex) do
194
# Replace every match of REMOVE_EMPHASIS_REGEX with its second capture
# group, i.e. keep the emphasised text and drop the surrounding markers.
# Returns a new string.
def remove_emphasis(str)
  str.gsub(REMOVE_EMPHASIS_REGEX) { Regexp.last_match(2) }
end
300
199
 
301
- def chrref_to_utf!(num_str)
302
- begin
303
- num_str.gsub!($chrref_to_utf_regex) do
304
- if $1 == 'x'
305
- ch = $2.to_i(16)
306
- else
307
- ch = $2.to_i
308
- end
309
- hi = ch>>8
310
- lo = ch&0xff
311
- u = "\377\376" << lo.chr << hi.chr
312
- u.encode("UTF-8", "UTF-16")
313
- end
314
- rescue StandardError
315
- return nil
200
# Replace numeric character references matched by CHRREF_TO_UTF_REGEX
# ("&#233;", "&#x1F600;", ...) with the UTF-8 character they denote.
#
# A reference whose number is not a valid Unicode code point (surrogates,
# values beyond U+10FFFF) is left in the text unchanged instead of aborting
# the conversion of the whole string.
#
# Returns a new string, or +num_str+ itself if the substitution as a whole
# fails (for example because of an encoding mismatch).
def chrref_to_utf(num_str)
  num_str.gsub(CHRREF_TO_UTF_REGEX) do |reference|
    hex_marker = Regexp.last_match(1)
    digits = Regexp.last_match(2)
    codepoint = hex_marker == "x" ? digits.to_i(16) : digits.to_i
    begin
      # Integer#chr covers the full Unicode range; building a two-byte
      # UTF-16 sequence by hand only worked for part of the BMP.
      codepoint.chr(Encoding::UTF_8)
    rescue RangeError
      reference
    end
  end
rescue StandardError
  num_str
end
319
-
320
- def mndash!(str)
321
- str.gsub!($mndash_regex, "–")
215
+
216
# Return a copy of +str+ with every match of MNDASH_REGEX replaced by an
# en dash.
def mndash(str)
  str.gsub(MNDASH_REGEX) { "–" }
end
323
219
 
324
- def remove_hr!(str)
325
- str.gsub!($remove_hr_regex, "")
220
# Return a copy of +str+ with every match of REMOVE_HR_REGEX deleted.
def remove_hr(str)
  str.gsub(REMOVE_HR_REGEX) { "" }
end
327
223
 
328
- def remove_ref!(str)
329
- str.gsub!($format_ref_regex){""}
224
# Return a copy of +str+ with every match of FORMAT_REF_REGEX deleted.
def remove_ref(str)
  str.gsub(FORMAT_REF_REGEX, "")
end
331
227
 
332
- def remove_html!(str)
333
- str.gsub!(/<[^<>]+\/>/){""}
228
# Strip selected HTML constructs from +str+ and return the result as a new
# string; +str+ itself is not modified.
#
# Self-closing tags ("<.../>") are deleted first. Then, for each of div,
# gallery, timeline and noinclude, everything between "<tag" and "tag>" is
# removed via process_nested_structure.
def remove_html(str)
  cleaned = str.gsub(%r{<[^<>]+/>}) { "" }
  %w[div gallery timeline noinclude].each do |tag|
    cleaned = process_nested_structure(StringScanner.new(cleaned), "<#{tag}", "#{tag}>") { "" }
  end
  cleaned
end
342
240
 
343
- def remove_complex!(str)
344
- str.gsub!($complex_regex_01){"《#{$1}》"}
345
- str.gsub!($complex_regex_02){""}
346
- str.gsub!($complex_regex_03){""}
347
- str.gsub!($complex_regex_04){""}
348
- str.gsub!($complex_regex_05){""}
241
# Apply the COMPLEX_REGEX_* rules to +str+ and return the result.
#
# Matches of COMPLEX_REGEX_01 are rewritten as their first capture wrapped
# in "《" and "》"; matches of COMPLEX_REGEX_02..05 are deleted, in order.
def remove_complex(str)
  bracketed = str.gsub(COMPLEX_REGEX_01) { "《#{Regexp.last_match(1)}》" }
  deletions = [COMPLEX_REGEX_02, COMPLEX_REGEX_03, COMPLEX_REGEX_04, COMPLEX_REGEX_05]
  deletions.reduce(bracketed) { |current, pattern| current.gsub(pattern, "") }
end
350
-
351
- def make_reference!(str)
352
- str.gsub!($make_reference_regex_a){"\n"}
353
- str.gsub!($make_reference_regex_b){""}
354
- str.gsub!($make_reference_regex_c){"[ref]"}
355
- str.gsub!($make_reference_regex_d){"[/ref]"}
248
+
249
# Rewrite reference markup in +str+ and return the result.
#
# In order: matches of MAKE_REFERENCE_REGEX_A become a newline, matches of
# _B are deleted, matches of _C become "[ref]" and matches of _D "[/ref]".
def make_reference(str)
  substitutions = [
    [MAKE_REFERENCE_REGEX_A, "\n"],
    [MAKE_REFERENCE_REGEX_B, ""],
    [MAKE_REFERENCE_REGEX_C, "[ref]"],
    [MAKE_REFERENCE_REGEX_D, "[/ref]"]
  ]
  substitutions.reduce(str) do |current, (pattern, replacement)|
    current.gsub(pattern) { replacement }
  end
end
357
255
 
358
- def correct_inline_template!(str)
256
+ def correct_inline_template(str)
359
257
  scanner = StringScanner.new(str)
360
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
258
+ process_nested_structure(scanner, "{{", "}}") do |contents|
361
259
  parts = contents.split("|")
362
260
  if /\A(?:lang|fontsize)\z/i =~ parts[0]
363
261
  parts.shift
364
- elsif /\Alang\-/i =~ parts[0]
262
+ elsif /\Alang-/i =~ parts[0]
365
263
  parts.shift
366
264
  elsif /\Alang=/i =~ parts[1]
367
265
  parts.shift
@@ -372,27 +270,25 @@ module Wp2txt
372
270
  else
373
271
  begin
374
272
  keyval = parts[1].split("=")
375
- if keyval.size > 1
376
- out = keyval[1]
377
- else
378
- out = parts[1] || ""
379
- end
380
- rescue
273
+ out = if keyval.size > 1
274
+ keyval[1]
275
+ else
276
+ parts[1] || ""
277
+ end
278
+ rescue StandardError
381
279
  out = parts[1] || ""
382
280
  end
383
281
  end
384
-
385
282
  out.strip
386
283
  end
387
- str.replace result
388
284
  end
389
285
 
390
- #################### file related utilities ####################
286
+ #################### file related utilities ####################
391
287
 
392
288
  # collect filenames recursively
393
289
  def collect_files(str, regex = nil)
394
290
  regex ||= //
395
- text_array = Array.new
291
+ text_array = []
396
292
  Find.find(str) do |f|
397
293
  text_array << f if regex =~ f
398
294
  end
@@ -400,11 +296,11 @@ module Wp2txt
400
296
  end
401
297
 
402
298
  # modify a file using block/yield mechanism
403
- def file_mod(file_path, backup = false, &block)
299
# Rewrite the file at +file_path+ with whatever the given block returns.
#
# The block receives the current file content as a String. If it returns
# nil the content is written back unchanged; any other return value
# replaces the content.
#
# file_path - path of the file to modify.
# backup    - when truthy, the previous version is kept as
#             "<file_path>.bak"; otherwise that backup is deleted.
#
# The new content is staged in a file named "temp" in the current working
# directory before being renamed into place.
def file_mod(file_path, backup = false)
  str = File.read(file_path)
  newstr = yield(str)
  # Only a nil block result means "keep the original". The previous
  # `if nil? newstr` raised ArgumentError on every call.
  str = newstr unless newstr.nil?
  File.write("temp", str)

  File.rename(file_path, file_path + ".bak")
  File.rename("temp", file_path)
  File.unlink(file_path + ".bak") unless backup
end
417
313
 
418
314
  # modify files under a directory (recursive)
419
- def batch_file_mod(dir_path, &block)
315
# Yield file paths to the given block.
#
# If +dir_path+ is a directory, every entry returned by collect_files that
# is a regular file is yielded. If +dir_path+ is itself a regular file, it
# is yielded once. Anything else yields nothing.
def batch_file_mod(dir_path)
  unless FileTest.directory?(dir_path)
    return FileTest.file?(dir_path) ? yield(dir_path) : nil
  end

  collect_files(dir_path).each do |path|
    next unless FileTest.file?(path)

    yield path
  end
end
428
324
 
429
325
  # take care of difference of separators among environments
430
326
  def correct_separator(input)
431
- if input.is_a?(String)
432
- ret_str = String.new
327
+ case input
328
+ when String
433
329
  if RUBY_PLATFORM.index("win32")
434
- ret_str = input.gsub("/", "\\")
330
+ input.gsub("/", "\\")
435
331
  else
436
- ret_str = input.gsub("\\", "/")
332
+ input.gsub("\\", "/")
437
333
  end
438
- return ret_str
439
- elsif input.is_a?(Array)
440
- ret_array = Array.new
334
+ when Array
335
+ ret_array = []
441
336
  input.each do |item|
442
337
  ret_array << correct_separator(item)
443
338
  end
444
- return ret_array
339
+ ret_array
445
340
  end
446
341
  end
447
342
 
448
- def rename(files, ext = "txt")
343
# Rename +files+ so that their trailing "-<number>" suffixes are zero-padded
# to a common width, and append ".<ext>" to every name.
#
# files - Array of file paths; a path without a numeric suffix only gets
#         the extension appended.
# ext   - extension to append, without the leading dot.
#
# Returns true.
def rename(files, ext = "txt")
  numbered = /-(\d+)\z/

  # The padding width has to be known for ALL files before the first one is
  # renamed; computing it inside the rename loop pads early files too little.
  maxwidth = files.map { |f| f.slice(numbered, 1).to_s.length }.max || 0

  files.each do |f|
    newname = f.sub(numbered) do
      "-" + format("%0#{maxwidth}d", Regexp.last_match(1).to_i)
    end
    File.rename(f, "#{newname}.#{ext}")
  end
  true
end
465
357
 
466
358
  # convert int of seconds to string in the format 00:00:00
@@ -472,8 +364,6 @@ module Wp2txt
472
364
  h = int / 3600
473
365
  m = (int - h * 3600) / 60
474
366
  s = int % 60
475
- str = sprintf("%02d:%02d:%02d", h, m, s)
476
- return str
367
+ format("%02d:%02d:%02d", h, m, s)
477
368
  end
478
-
479
369
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Wp2txt
2
- VERSION = "1.0.1"
4
+ VERSION = "1.1.0"
3
5
  end