wp2txt 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wp2txt/utils.rb CHANGED
@@ -1,183 +1,87 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- require 'strscan'
5
- require 'find'
6
- require 'htmlentities'
7
-
8
- ###################################################
9
- # global variables to save resource for generating regexps
10
- # those with a trailing number 1 represent opening tag/markup
11
- # those with a trailing number 2 represent closing tag/markup
12
- # those without a trailing number contain both opening/closing tags/markups
13
-
14
- $html_decoder = HTMLEntities.new
15
-
16
- $entities = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
17
- $html_hash = Hash[*$entities.flatten]
18
- $html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
19
- $ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
20
- $ml_template_end_regex = Regexp.new('\}\}\s*$')
21
- $ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
22
- $ml_linkend_regex = Regexp.new('\]\]\s*$')
23
- $isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
24
- $isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
25
- $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
26
- $in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
27
- $in_inputbox_regex1 = Regexp.new('<inputbox>')
28
- $in_inputbox_regex2 = Regexp.new('<\/inputbox>')
29
- $in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
30
- $in_source_regex1 = Regexp.new('<source.*?>')
31
- $in_source_regex2 = Regexp.new('<\/source>')
32
- $in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
33
- $in_math_regex1 = Regexp.new('<math.*?>')
34
- $in_math_regex2 = Regexp.new('<\/math>')
35
- $in_heading_regex = Regexp.new('^=+.*?=+$')
36
- $in_html_table_regex = Regexp.new('<table.*?><\/table>')
37
- $in_html_table_regex1 = Regexp.new('<table\b')
38
- $in_html_table_regex2 = Regexp.new('<\/\s*table>')
39
- $in_table_regex1 = Regexp.new('^\s*\{\|')
40
- $in_table_regex2 = Regexp.new('^\|\}.*?$')
41
- $in_unordered_regex = Regexp.new('^\*')
42
- $in_ordered_regex = Regexp.new('^\#')
43
- $in_pre_regex = Regexp.new('^ ')
44
- $in_definition_regex = Regexp.new('^[\;\:]')
45
- $blank_line_regex = Regexp.new('^\s*$')
46
- $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
47
- $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
48
- $remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
49
- $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
50
- $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
51
- $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
52
- $remove_hr_regex = Regexp.new('^\s*\-+\s*$')
53
- $make_reference_regex_a = Regexp.new('<br ?\/>')
54
- $make_reference_regex_b = Regexp.new('<ref[^>]*\/>')
55
- $make_reference_regex_c = Regexp.new('<ref[^>]*>')
56
- $make_reference_regex_d = Regexp.new('<\/ref>')
57
- $format_ref_regex = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
58
- $heading_onset_regex = Regexp.new('^(\=+)\s+')
59
- $heading_coda_regex = Regexp.new('\s+(\=+)$')
60
- $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
61
- $pre_marks_regex = Regexp.new('\A\^\ ')
62
- $def_marks_regex = Regexp.new('\A[\;\:\ ]+')
63
- $onset_bar_regex = Regexp.new('\A[^\|]+\z')
64
-
65
- $category_patterns = ["Category", "Categoria"].join("|")
66
- $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
67
-
68
- $escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
69
- $unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
70
-
71
- $remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
72
- $remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
73
- $type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
74
-
75
- $single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escape(']')})", Regexp::MULTILINE)
76
- $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
77
- $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
78
- $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
79
- $curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
80
-
81
- $complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
82
- $complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
83
- $complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
84
- $complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
85
- $complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
86
-
87
- $cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
88
- $cleanup_regex_02 = Regexp.new('^File:.+$')
89
- $cleanup_regex_03 = Regexp.new('^\|.*$')
90
- $cleanup_regex_04 = Regexp.new('\{\{.*$')
91
- $cleanup_regex_05 = Regexp.new('^.*\}\}')
92
- $cleanup_regex_06 = Regexp.new('\{\|.*$')
93
- $cleanup_regex_07 = Regexp.new('^.*\|\}')
94
- $cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
95
-
96
- ###################################################
1
+ # frozen_string_literal: true
97
2
 
98
- module Wp2txt
3
+ require "strscan"
4
+ require "find"
5
+ require_relative "regex"
99
6
 
100
- def convert_characters!(text, has_retried = false)
101
- begin
102
- text << ""
103
- chrref_to_utf!(text)
104
- special_chr!(text)
105
- text.encode!("UTF-8", "UTF-8", invalid: :replace, replace: "")
106
-
107
- rescue # detect invalid byte sequence in UTF-8
108
- if has_retried
109
- puts "invalid byte sequence detected"
110
- puts "******************************"
111
- File.open("error_log.txt", "w") do |f|
112
- f.write text
113
- end
114
- exit
115
- else
116
- text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
117
- text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
118
- convert_characters!(text, true)
7
+ module Wp2txt
8
+ def convert_characters(text, has_retried = false)
9
+ text << ""
10
+ text = chrref_to_utf(text)
11
+ text = special_chr(text)
12
+ text = text.encode("UTF-8", "UTF-8", invalid: :replace, replace: "")
13
+ rescue StandardError # detect invalid byte sequence in UTF-8
14
+ if has_retried
15
+ puts "invalid byte sequence detected"
16
+ puts "******************************"
17
+ File.open("error_log.txt", "w") do |f|
18
+ f.write text
119
19
  end
20
+ exit
21
+ else
22
+ text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
23
+ text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
24
+ convert_characters(text, true)
120
25
  end
121
26
  end
122
27
 
123
- def format_wiki!(text, has_retried = false)
124
- remove_complex!(text)
125
-
126
- escape_nowiki!(text)
127
- process_interwiki_links!(text)
128
- process_external_links!(text)
129
- unescape_nowiki!(text)
130
- remove_directive!(text)
131
- remove_emphasis!(text)
132
- mndash!(text)
133
- remove_hr!(text)
134
- remove_tag!(text)
135
- correct_inline_template!(text) unless $leave_inline_template
136
- remove_templates!(text) unless $leave_inline_template
137
- remove_table!(text) unless $leave_table
28
+ def format_wiki(text, config = {})
29
+ text = remove_complex(text)
30
+ text = escape_nowiki(text)
31
+ text = process_interwiki_links(text)
32
+ text = process_external_links(text)
33
+ text = unescape_nowiki(text)
34
+ text = remove_directive(text)
35
+ text = remove_emphasis(text)
36
+ text = mndash(text)
37
+ text = remove_hr(text)
38
+ text = remove_tag(text)
39
+ text = correct_inline_template(text) unless config[:inline]
40
+ text = remove_templates(text) unless config[:inline]
41
+ text = remove_table(text) unless config[:table]
42
+ text
138
43
  end
139
44
 
140
- def cleanup!(text)
141
- text.gsub!($cleanup_regex_01){""}
142
- text.gsub!($cleanup_regex_02){""}
143
- text.gsub!($cleanup_regex_03){""}
144
- text.gsub!($cleanup_regex_04){""}
145
- text.gsub!($cleanup_regex_05){""}
146
- text.gsub!($cleanup_regex_06){""}
147
- text.gsub!($cleanup_regex_07){""}
148
- text.gsub!($cleanup_regex_08){"\n\n"}
149
- text.strip!
45
+ def cleanup(text)
46
+ text = text.gsub(CLEANUP_REGEX_01) { "" }
47
+ text = text.gsub(CLEANUP_REGEX_02) { "" }
48
+ text = text.gsub(CLEANUP_REGEX_03) { "" }
49
+ text = text.gsub(CLEANUP_REGEX_04) { "" }
50
+ text = text.gsub(CLEANUP_REGEX_05) { "" }
51
+ text = text.gsub(CLEANUP_REGEX_06) { "" }
52
+ text = text.gsub(CLEANUP_REGEX_07) { "" }
53
+ text = text.gsub(CLEANUP_REGEX_08) { "\n\n" }
54
+ text = text.strip
150
55
  text << "\n\n"
151
56
  end
152
57
 
153
58
  #################### parser for nested structure ####################
154
59
 
155
60
  def process_nested_structure(scanner, left, right, &block)
156
- test = false
157
- buffer = ""
61
+ buffer = +""
158
62
  begin
159
- if left == "[" && right == "]"
160
- regex = $single_square_bracket_regex
161
- elsif left == "[[" && right == "]]"
162
- regex = $double_square_bracket_regex
163
- elsif left == "{" && right == "}"
164
- regex = $single_curly_bracket_regex
165
- elsif left == "{{" && right == "}}"
166
- regex = $double_curly_bracket_regex
167
- elsif left == "{|" && right == "|}"
168
- regex = $curly_square_bracket_regex
169
- else
170
- regex = Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
171
- end
172
- while str = scanner.scan_until(regex)
63
+ regex = if left == "[" && right == "]"
64
+ SINGLE_SQUARE_BRACKET_REGEX
65
+ elsif left == "[[" && right == "]]"
66
+ DOUBLE_SQUARE_BRACKET_REGEX
67
+ elsif left == "{" && right == "}"
68
+ SINGLE_CURLY_BRACKET_REGEX
69
+ elsif left == "{{" && right == "}}"
70
+ DOUBLE_CURLY_BRACKET_REGEX
71
+ elsif left == "{|" && right == "|}"
72
+ CURLY_SQUARE_BRACKET_REGEX
73
+ else
74
+ Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
75
+ end
76
+ while (str = scanner.scan_until(regex))
173
77
  case scanner[1]
174
78
  when left
175
79
  buffer << str
176
80
  has_left = true
177
81
  when right
178
82
  if has_left
179
- buffer = buffer[0...-(left.size)]
180
- contents = block.call(str[0...-(left.size)])
83
+ buffer = buffer[0...-left.size]
84
+ contents = block.call(str[0...-left.size])
181
85
  buffer << contents
182
86
  break
183
87
  else
@@ -187,25 +91,23 @@ module Wp2txt
187
91
  end
188
92
  buffer << scanner.rest
189
93
 
190
- if buffer == scanner.string
191
- return buffer
192
- else
193
- scanner.string = buffer
194
- return process_nested_structure(scanner, left, right, &block) || ""
195
- end
196
- rescue => e
197
- return scanner.string
94
+ return buffer if buffer == scanner.string
95
+
96
+ scanner.string = buffer
97
+ process_nested_structure(scanner, left, right, &block) || ""
98
+ rescue StandardError
99
+ scanner.string
198
100
  end
199
101
  end
200
102
 
201
103
  #################### methods used from format_wiki ####################
202
- def escape_nowiki!(str)
104
+ def escape_nowiki(str)
203
105
  if @nowikis
204
106
  @nowikis.clear
205
107
  else
206
108
  @nowikis = {}
207
109
  end
208
- str.gsub!($escape_nowiki_regex) do
110
+ str.gsub(ESCAPE_NOWIKI_REGEX) do
209
111
  nowiki = $1
210
112
  nowiki_id = nowiki.object_id
211
113
  @nowikis[nowiki_id] = nowiki
@@ -213,16 +115,16 @@ module Wp2txt
213
115
  end
214
116
  end
215
117
 
216
- def unescape_nowiki!(str)
217
- str.gsub!($unescape_nowiki_regex) do
118
+ def unescape_nowiki(str)
119
+ str.gsub(UNESCAPE_NOWIKI_REGEX) do
218
120
  obj_id = $1.to_i
219
121
  @nowikis[obj_id]
220
122
  end
221
123
  end
222
124
 
223
- def process_interwiki_links!(str)
125
+ def process_interwiki_links(str)
224
126
  scanner = StringScanner.new(str)
225
- result = process_nested_structure(scanner, "[[", "]]") do |contents|
127
+ process_nested_structure(scanner, "[[", "]]") do |contents|
226
128
  parts = contents.split("|")
227
129
  case parts.size
228
130
  when 1
@@ -232,12 +134,11 @@ module Wp2txt
232
134
  parts.join("|")
233
135
  end
234
136
  end
235
- str.replace(result)
236
137
  end
237
138
 
238
- def process_external_links!(str)
139
+ def process_external_links(str)
239
140
  scanner = StringScanner.new(str)
240
- result = process_nested_structure(scanner, "[", "]") do |contents|
141
+ process_nested_structure(scanner, "[", "]") do |contents|
241
142
  if /\A\s.+\s\z/ =~ contents
242
143
  " (#{contents.strip}) "
243
144
  else
@@ -250,119 +151,115 @@ module Wp2txt
250
151
  end
251
152
  end
252
153
  end
253
- str.replace(result)
254
154
  end
255
155
 
256
156
  #################### methods used from format_article ####################
257
157
 
258
- def remove_templates!(str)
259
- scanner = StringScanner.new(str)
260
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
158
+ def remove_templates(str)
159
+ scanner1 = StringScanner.new(str)
160
+ result = process_nested_structure(scanner1, "{{", "}}") do
261
161
  ""
262
162
  end
263
- scanner = StringScanner.new(result)
264
- result = process_nested_structure(scanner, "{", "}") do |contents|
163
+ scanner2 = StringScanner.new(result)
164
+ process_nested_structure(scanner2, "{", "}") do
265
165
  ""
266
166
  end
267
- str.replace(result)
268
167
  end
269
168
 
270
- def remove_table!(str)
169
+ def remove_table(str)
271
170
  scanner = StringScanner.new(str)
272
- result = process_nested_structure(scanner, "{|", "|}") do |contents|
171
+ process_nested_structure(scanner, "{|", "|}") do
273
172
  ""
274
173
  end
275
- str.replace(result)
276
174
  end
277
175
 
278
- def special_chr!(str)
279
- str.replace $html_decoder.decode(str)
176
+ def special_chr(str)
177
+ HTML_DECODER.decode(str)
280
178
  end
281
179
 
282
- def remove_inbetween!(str, tagset = ['<', '>'])
180
+ def remove_inbetween(str, tagset = ["<", ">"])
283
181
  tagsets = Regexp.quote(tagset.uniq.join(""))
284
182
  regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
285
- str.gsub!(regex, "")
183
+ str.gsub(regex, "")
286
184
  end
287
185
 
288
- def remove_tag!(str)
289
- str.gsub!($remove_tag_regex, "")
186
+ def remove_tag(str)
187
+ str.gsub(REMOVE_TAG_REGEX, "")
290
188
  end
291
189
 
292
- def remove_directive!(str)
293
- str.gsub!($remove_directives_regex, "")
190
+ def remove_directive(str)
191
+ str.gsub(REMOVE_DIRECTIVES_REGEX, "")
294
192
  end
295
193
 
296
- def remove_emphasis!(str)
297
- str.gsub!($remove_emphasis_regex) do
194
+ def remove_emphasis(str)
195
+ str.gsub(REMOVE_EMPHASIS_REGEX) do
298
196
  $2
299
197
  end
300
198
  end
301
199
 
302
- def chrref_to_utf!(num_str)
303
- begin
304
- num_str.gsub!($chrref_to_utf_regex) do
305
- if $1 == 'x'
306
- ch = $2.to_i(16)
307
- else
308
- ch = $2.to_i
309
- end
310
- hi = ch>>8
311
- lo = ch&0xff
312
- u = "\377\376" << lo.chr << hi.chr
313
- u.encode("UTF-8", "UTF-16")
314
- end
315
- rescue StandardError
316
- return nil
200
+ def chrref_to_utf(num_str)
201
+ num_str.gsub(CHRREF_TO_UTF_REGEX) do
202
+ ch = if $1 == "x"
203
+ $2.to_i(16)
204
+ else
205
+ $2.to_i
206
+ end
207
+ hi = ch >> 8
208
+ lo = ch & 0xff
209
+ u = +"\377\376" << lo.chr << hi.chr
210
+ u.encode("UTF-8", "UTF-16")
317
211
  end
318
- return true
212
+ rescue StandardError
213
+ num_str
319
214
  end
320
215
 
321
- def mndash!(str)
322
- str.gsub!($mndash_regex, "–")
216
+ def mndash(str)
217
+ str.gsub(MNDASH_REGEX, "–")
323
218
  end
324
219
 
325
- def remove_hr!(str)
326
- str.gsub!($remove_hr_regex, "")
220
+ def remove_hr(str)
221
+ str.gsub(REMOVE_HR_REGEX, "")
327
222
  end
328
223
 
329
- def remove_ref!(str)
330
- str.gsub!($format_ref_regex){""}
224
+ def remove_ref(str)
225
+ str.gsub(FORMAT_REF_REGEX) { "" }
331
226
  end
332
227
 
333
- def remove_html!(str)
334
- str.gsub!(/<[^<>]+\/>/){""}
228
+ def remove_html(str)
229
+ res = +str.dup
230
+ res.gsub!(%r{<[^<>]+/>}) { "" }
335
231
  ["div", "gallery", "timeline", "noinclude"].each do |tag|
336
- scanner = StringScanner.new(str)
337
- result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do |contents|
232
+ scanner = StringScanner.new(res)
233
+ result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do
338
234
  ""
339
235
  end
340
- str.replace(result)
236
+ res.replace(result)
341
237
  end
238
+ res
342
239
  end
343
240
 
344
- def remove_complex!(str)
345
- str.gsub!($complex_regex_01){"《#{$1}》"}
346
- str.gsub!($complex_regex_02){""}
347
- str.gsub!($complex_regex_03){""}
348
- str.gsub!($complex_regex_04){""}
349
- str.gsub!($complex_regex_05){""}
241
+ def remove_complex(str)
242
+ str = str.gsub(COMPLEX_REGEX_01) { "《#{$1}》" }
243
+ str = str.gsub(COMPLEX_REGEX_02) { "" }
244
+ str = str.gsub(COMPLEX_REGEX_03) { "" }
245
+ str = str.gsub(COMPLEX_REGEX_04) { "" }
246
+ str.gsub(COMPLEX_REGEX_05) { "" }
350
247
  end
351
248
 
352
- def make_reference!(str)
353
- str.gsub!($make_reference_regex_a){"\n"}
354
- str.gsub!($make_reference_regex_b){""}
355
- str.gsub!($make_reference_regex_c){"[ref]"}
356
- str.gsub!($make_reference_regex_d){"[/ref]"}
249
+ def make_reference(str)
250
+ str = str.gsub(MAKE_REFERENCE_REGEX_A) { "\n" }
251
+ str = str.gsub(MAKE_REFERENCE_REGEX_B) { "" }
252
+ str = str.gsub(MAKE_REFERENCE_REGEX_C) { "[ref]" }
253
+ str.gsub(MAKE_REFERENCE_REGEX_D) { "[/ref]" }
357
254
  end
358
255
 
359
- def correct_inline_template!(str)
256
+ def correct_inline_template(str)
360
257
  scanner = StringScanner.new(str)
361
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
258
+ process_nested_structure(scanner, "{{", "}}") do |contents|
362
259
  parts = contents.split("|")
363
260
  if /\A(?:lang|fontsize)\z/i =~ parts[0]
364
261
  parts.shift
365
- elsif /\Alang\-/i =~ parts[0]
262
+ elsif /\Alang-/i =~ parts[0]
366
263
  parts.shift
367
264
  elsif /\Alang=/i =~ parts[1]
368
265
  parts.shift
@@ -373,27 +270,25 @@ module Wp2txt
373
270
  else
374
271
  begin
375
272
  keyval = parts[1].split("=")
376
- if keyval.size > 1
377
- out = keyval[1]
378
- else
379
- out = parts[1] || ""
380
- end
381
- rescue
273
+ out = if keyval.size > 1
274
+ keyval[1]
275
+ else
276
+ parts[1] || ""
277
+ end
278
+ rescue StandardError
382
279
  out = parts[1] || ""
383
280
  end
384
281
  end
385
-
386
282
  out.strip
387
283
  end
388
- str.replace result
389
284
  end
390
285
 
391
- #################### file related utilities ####################
286
+ #################### file related utilities ####################
392
287
 
393
288
  # collect filenames recursively
394
289
  def collect_files(str, regex = nil)
395
290
  regex ||= //
396
- text_array = Array.new
291
+ text_array = []
397
292
  Find.find(str) do |f|
398
293
  text_array << f if regex =~ f
399
294
  end
@@ -401,11 +296,11 @@ module Wp2txt
401
296
  end
402
297
 
403
298
  # modify a file using block/yield mechanism
404
- def file_mod(file_path, backup = false, &block)
299
+ def file_mod(file_path, backup = false)
405
300
  File.open(file_path, "r") do |fr|
406
301
  str = fr.read
407
302
  newstr = yield(str)
408
- str = newstr unless newstr == nil
303
+ str = newstr if nil? newstr
409
304
  File.open("temp", "w") do |tf|
410
305
  tf.write(str)
411
306
  end
@@ -417,32 +312,31 @@ module Wp2txt
417
312
  end
418
313
 
419
314
  # modify files under a directry (recursive)
420
- def batch_file_mod(dir_path, &block)
315
+ def batch_file_mod(dir_path)
421
316
  if FileTest.directory?(dir_path)
422
317
  collect_files(dir_path).each do |file|
423
318
  yield file if FileTest.file?(file)
424
319
  end
425
- else
426
- yield dir_path if FileTest.file?(dir_path)
320
+ elsif FileTest.file?(dir_path)
321
+ yield dir_path
427
322
  end
428
323
  end
429
324
 
430
325
  # take care of difference of separators among environments
431
326
  def correct_separator(input)
432
- if input.is_a?(String)
433
- ret_str = String.new
327
+ case input
328
+ when String
434
329
  if RUBY_PLATFORM.index("win32")
435
- ret_str = input.gsub("/", "\\")
330
+ input.gsub("/", "\\")
436
331
  else
437
- ret_str = input.gsub("\\", "/")
332
+ input.gsub("\\", "/")
438
333
  end
439
- return ret_str
440
- elsif input.is_a?(Array)
441
- ret_array = Array.new
334
+ when Array
335
+ ret_array = []
442
336
  input.each do |item|
443
337
  ret_array << correct_separator(item)
444
338
  end
445
- return ret_array
339
+ ret_array
446
340
  end
447
341
  end
448
342
 
@@ -451,17 +345,14 @@ module Wp2txt
451
345
  maxwidth = 0
452
346
 
453
347
  files.each do |f|
454
- width = f.slice(/\-(\d+)\z/, 1).to_s.length.to_i
348
+ width = f.slice(/-(\d+)\z/, 1).to_s.length.to_i
455
349
  maxwidth = width if maxwidth < width
456
- end
457
-
458
- files.each do |f|
459
- newname= f.sub(/\-(\d+)\z/) do
460
- "-" + sprintf("%0#{maxwidth}d", $1.to_i)
350
+ newname = f.sub(/-(\d+)\z/) do
351
+ "-" + format("%0#{maxwidth}d", $1.to_i)
461
352
  end
462
353
  File.rename(f, newname + ".#{ext}")
463
354
  end
464
- return true
355
+ true
465
356
  end
466
357
 
467
358
  # convert int of seconds to string in the format 00:00:00
@@ -473,8 +364,6 @@ module Wp2txt
473
364
  h = int / 3600
474
365
  m = (int - h * 3600) / 60
475
366
  s = int % 60
476
- str = sprintf("%02d:%02d:%02d", h, m, s)
477
- return str
367
+ format("%02d:%02d:%02d", h, m, s)
478
368
  end
479
-
480
369
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Wp2txt
2
- VERSION = "1.0.2"
4
+ VERSION = "1.1.0"
3
5
  end