wp2txt 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +26 -3
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +159 -270
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +119 -150
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -12
- data/tags +0 -58
data/lib/wp2txt/utils.rb
CHANGED
@@ -1,183 +1,87 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
require 'strscan'
|
5
|
-
require 'find'
|
6
|
-
require 'htmlentities'
|
7
|
-
|
8
|
-
###################################################
|
9
|
-
# global variables to save resource for generating regexps
|
10
|
-
# those with a trailing number 1 represent opening tag/markup
|
11
|
-
# those with a trailing number 2 represent closing tag/markup
|
12
|
-
# those without a trailing number contain both opening/closing tags/markups
|
13
|
-
|
14
|
-
$html_decoder = HTMLEntities.new
|
15
|
-
|
16
|
-
$entities = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
|
17
|
-
$html_hash = Hash[*$entities.flatten]
|
18
|
-
$html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
|
19
|
-
$ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
|
20
|
-
$ml_template_end_regex = Regexp.new('\}\}\s*$')
|
21
|
-
$ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
|
22
|
-
$ml_linkend_regex = Regexp.new('\]\]\s*$')
|
23
|
-
$isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
|
24
|
-
$isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
|
25
|
-
$in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
26
|
-
$in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
|
27
|
-
$in_inputbox_regex1 = Regexp.new('<inputbox>')
|
28
|
-
$in_inputbox_regex2 = Regexp.new('<\/inputbox>')
|
29
|
-
$in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
|
30
|
-
$in_source_regex1 = Regexp.new('<source.*?>')
|
31
|
-
$in_source_regex2 = Regexp.new('<\/source>')
|
32
|
-
$in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
|
33
|
-
$in_math_regex1 = Regexp.new('<math.*?>')
|
34
|
-
$in_math_regex2 = Regexp.new('<\/math>')
|
35
|
-
$in_heading_regex = Regexp.new('^=+.*?=+$')
|
36
|
-
$in_html_table_regex = Regexp.new('<table.*?><\/table>')
|
37
|
-
$in_html_table_regex1 = Regexp.new('<table\b')
|
38
|
-
$in_html_table_regex2 = Regexp.new('<\/\s*table>')
|
39
|
-
$in_table_regex1 = Regexp.new('^\s*\{\|')
|
40
|
-
$in_table_regex2 = Regexp.new('^\|\}.*?$')
|
41
|
-
$in_unordered_regex = Regexp.new('^\*')
|
42
|
-
$in_ordered_regex = Regexp.new('^\#')
|
43
|
-
$in_pre_regex = Regexp.new('^ ')
|
44
|
-
$in_definition_regex = Regexp.new('^[\;\:]')
|
45
|
-
$blank_line_regex = Regexp.new('^\s*$')
|
46
|
-
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
47
|
-
$remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
|
48
|
-
$remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
|
49
|
-
$remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
50
|
-
$chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
51
|
-
$mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
|
52
|
-
$remove_hr_regex = Regexp.new('^\s*\-+\s*$')
|
53
|
-
$make_reference_regex_a = Regexp.new('<br ?\/>')
|
54
|
-
$make_reference_regex_b = Regexp.new('<ref[^>]*\/>')
|
55
|
-
$make_reference_regex_c = Regexp.new('<ref[^>]*>')
|
56
|
-
$make_reference_regex_d = Regexp.new('<\/ref>')
|
57
|
-
$format_ref_regex = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
|
58
|
-
$heading_onset_regex = Regexp.new('^(\=+)\s+')
|
59
|
-
$heading_coda_regex = Regexp.new('\s+(\=+)$')
|
60
|
-
$list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
|
61
|
-
$pre_marks_regex = Regexp.new('\A\^\ ')
|
62
|
-
$def_marks_regex = Regexp.new('\A[\;\:\ ]+')
|
63
|
-
$onset_bar_regex = Regexp.new('\A[^\|]+\z')
|
64
|
-
|
65
|
-
$category_patterns = ["Category", "Categoria"].join("|")
|
66
|
-
$category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
67
|
-
|
68
|
-
$escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
|
69
|
-
$unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
|
70
|
-
|
71
|
-
$remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
|
72
|
-
$remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
|
73
|
-
$type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
|
74
|
-
|
75
|
-
$single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escape(']')})", Regexp::MULTILINE)
|
76
|
-
$double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
|
77
|
-
$single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
|
78
|
-
$double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
|
79
|
-
$curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
|
80
|
-
|
81
|
-
$complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
|
82
|
-
$complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
|
83
|
-
$complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
|
84
|
-
$complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
85
|
-
$complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
86
|
-
|
87
|
-
$cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
|
88
|
-
$cleanup_regex_02 = Regexp.new('^File:.+$')
|
89
|
-
$cleanup_regex_03 = Regexp.new('^\|.*$')
|
90
|
-
$cleanup_regex_04 = Regexp.new('\{\{.*$')
|
91
|
-
$cleanup_regex_05 = Regexp.new('^.*\}\}')
|
92
|
-
$cleanup_regex_06 = Regexp.new('\{\|.*$')
|
93
|
-
$cleanup_regex_07 = Regexp.new('^.*\|\}')
|
94
|
-
$cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
|
95
|
-
|
96
|
-
###################################################
|
1
|
+
# frozen_string_literal: true
|
97
2
|
|
98
|
-
|
3
|
+
require "strscan"
|
4
|
+
require "find"
|
5
|
+
require_relative "regex"
|
99
6
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
f.write text
|
113
|
-
end
|
114
|
-
exit
|
115
|
-
else
|
116
|
-
text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
|
117
|
-
text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
|
118
|
-
convert_characters!(text, true)
|
7
|
+
module Wp2txt
|
8
|
+
def convert_characters(text, has_retried = false)
|
9
|
+
text << ""
|
10
|
+
text = chrref_to_utf(text)
|
11
|
+
text = special_chr(text)
|
12
|
+
text = text.encode("UTF-8", "UTF-8", invalid: :replace, replace: "")
|
13
|
+
rescue StandardError # detect invalid byte sequence in UTF-8
|
14
|
+
if has_retried
|
15
|
+
puts "invalid byte sequence detected"
|
16
|
+
puts "******************************"
|
17
|
+
File.open("error_log.txt", "w") do |f|
|
18
|
+
f.write text
|
119
19
|
end
|
20
|
+
exit
|
21
|
+
else
|
22
|
+
text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
|
23
|
+
text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
|
24
|
+
convert_characters(text, true)
|
120
25
|
end
|
121
26
|
end
|
122
27
|
|
123
|
-
def format_wiki
|
124
|
-
remove_complex
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
28
|
+
def format_wiki(text, config = {})
|
29
|
+
text = remove_complex(text)
|
30
|
+
text = escape_nowiki(text)
|
31
|
+
text = process_interwiki_links(text)
|
32
|
+
text = process_external_links(text)
|
33
|
+
text = unescape_nowiki(text)
|
34
|
+
text = remove_directive(text)
|
35
|
+
text = remove_emphasis(text)
|
36
|
+
text = mndash(text)
|
37
|
+
text = remove_hr(text)
|
38
|
+
text = remove_tag(text)
|
39
|
+
text = correct_inline_template(text) unless config[:inline]
|
40
|
+
text = remove_templates(text) unless config[:inline]
|
41
|
+
text = remove_table(text) unless config[:table]
|
42
|
+
text
|
138
43
|
end
|
139
44
|
|
140
|
-
def cleanup
|
141
|
-
text.gsub
|
142
|
-
text.gsub
|
143
|
-
text.gsub
|
144
|
-
text.gsub
|
145
|
-
text.gsub
|
146
|
-
text.gsub
|
147
|
-
text.gsub
|
148
|
-
text.gsub
|
149
|
-
text.strip
|
45
|
+
def cleanup(text)
|
46
|
+
text = text.gsub(CLEANUP_REGEX_01) { "" }
|
47
|
+
text = text.gsub(CLEANUP_REGEX_02) { "" }
|
48
|
+
text = text.gsub(CLEANUP_REGEX_03) { "" }
|
49
|
+
text = text.gsub(CLEANUP_REGEX_04) { "" }
|
50
|
+
text = text.gsub(CLEANUP_REGEX_05) { "" }
|
51
|
+
text = text.gsub(CLEANUP_REGEX_06) { "" }
|
52
|
+
text = text.gsub(CLEANUP_REGEX_07) { "" }
|
53
|
+
text = text.gsub(CLEANUP_REGEX_08) { "\n\n" }
|
54
|
+
text = text.strip
|
150
55
|
text << "\n\n"
|
151
56
|
end
|
152
57
|
|
153
58
|
#################### parser for nested structure ####################
|
154
59
|
|
155
60
|
def process_nested_structure(scanner, left, right, &block)
|
156
|
-
|
157
|
-
buffer = ""
|
61
|
+
buffer = +""
|
158
62
|
begin
|
159
|
-
if left == "[" && right == "]"
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
while str = scanner.scan_until(regex)
|
63
|
+
regex = if left == "[" && right == "]"
|
64
|
+
SINGLE_SQUARE_BRACKET_REGEX
|
65
|
+
elsif left == "[[" && right == "]]"
|
66
|
+
DOUBLE_SQUARE_BRACKET_REGEX
|
67
|
+
elsif left == "{" && right == "}"
|
68
|
+
SINGLE_CURLY_BRACKET_REGEX
|
69
|
+
elsif left == "{{" && right == "}}"
|
70
|
+
DOUBLE_CURLY_BRACKET_REGEX
|
71
|
+
elsif left == "{|" && right == "|}"
|
72
|
+
CURLY_SQUARE_BRACKET_REGEX
|
73
|
+
else
|
74
|
+
Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
|
75
|
+
end
|
76
|
+
while (str = scanner.scan_until(regex))
|
173
77
|
case scanner[1]
|
174
78
|
when left
|
175
79
|
buffer << str
|
176
80
|
has_left = true
|
177
81
|
when right
|
178
82
|
if has_left
|
179
|
-
buffer = buffer[0...-
|
180
|
-
contents = block.call(str[0...-
|
83
|
+
buffer = buffer[0...-left.size]
|
84
|
+
contents = block.call(str[0...-left.size])
|
181
85
|
buffer << contents
|
182
86
|
break
|
183
87
|
else
|
@@ -187,25 +91,23 @@ module Wp2txt
|
|
187
91
|
end
|
188
92
|
buffer << scanner.rest
|
189
93
|
|
190
|
-
if buffer == scanner.string
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
rescue => e
|
197
|
-
return scanner.string
|
94
|
+
return buffer if buffer == scanner.string
|
95
|
+
|
96
|
+
scanner.string = buffer
|
97
|
+
process_nested_structure(scanner, left, right, &block) || ""
|
98
|
+
rescue StandardError
|
99
|
+
scanner.string
|
198
100
|
end
|
199
101
|
end
|
200
102
|
|
201
103
|
#################### methods used from format_wiki ####################
|
202
|
-
def escape_nowiki
|
104
|
+
def escape_nowiki(str)
|
203
105
|
if @nowikis
|
204
106
|
@nowikis.clear
|
205
107
|
else
|
206
108
|
@nowikis = {}
|
207
109
|
end
|
208
|
-
str.gsub
|
110
|
+
str.gsub(ESCAPE_NOWIKI_REGEX) do
|
209
111
|
nowiki = $1
|
210
112
|
nowiki_id = nowiki.object_id
|
211
113
|
@nowikis[nowiki_id] = nowiki
|
@@ -213,16 +115,16 @@ module Wp2txt
|
|
213
115
|
end
|
214
116
|
end
|
215
117
|
|
216
|
-
def unescape_nowiki
|
217
|
-
str.gsub
|
118
|
+
def unescape_nowiki(str)
|
119
|
+
str.gsub(UNESCAPE_NOWIKI_REGEX) do
|
218
120
|
obj_id = $1.to_i
|
219
121
|
@nowikis[obj_id]
|
220
122
|
end
|
221
123
|
end
|
222
124
|
|
223
|
-
def process_interwiki_links
|
125
|
+
def process_interwiki_links(str)
|
224
126
|
scanner = StringScanner.new(str)
|
225
|
-
|
127
|
+
process_nested_structure(scanner, "[[", "]]") do |contents|
|
226
128
|
parts = contents.split("|")
|
227
129
|
case parts.size
|
228
130
|
when 1
|
@@ -232,12 +134,11 @@ module Wp2txt
|
|
232
134
|
parts.join("|")
|
233
135
|
end
|
234
136
|
end
|
235
|
-
str.replace(result)
|
236
137
|
end
|
237
138
|
|
238
|
-
def process_external_links
|
139
|
+
def process_external_links(str)
|
239
140
|
scanner = StringScanner.new(str)
|
240
|
-
|
141
|
+
process_nested_structure(scanner, "[", "]") do |contents|
|
241
142
|
if /\A\s.+\s\z/ =~ contents
|
242
143
|
" (#{contents.strip}) "
|
243
144
|
else
|
@@ -250,119 +151,115 @@ module Wp2txt
|
|
250
151
|
end
|
251
152
|
end
|
252
153
|
end
|
253
|
-
str.replace(result)
|
254
154
|
end
|
255
155
|
|
256
156
|
#################### methods used from format_article ####################
|
257
157
|
|
258
|
-
def remove_templates
|
259
|
-
|
260
|
-
result = process_nested_structure(
|
158
|
+
def remove_templates(str)
|
159
|
+
scanner1 = StringScanner.new(str)
|
160
|
+
result = process_nested_structure(scanner1, "{{", "}}") do
|
261
161
|
""
|
262
162
|
end
|
263
|
-
|
264
|
-
|
163
|
+
scanner2 = StringScanner.new(result)
|
164
|
+
process_nested_structure(scanner2, "{", "}") do
|
265
165
|
""
|
266
166
|
end
|
267
|
-
str.replace(result)
|
268
167
|
end
|
269
168
|
|
270
|
-
def remove_table
|
169
|
+
def remove_table(str)
|
271
170
|
scanner = StringScanner.new(str)
|
272
|
-
|
171
|
+
process_nested_structure(scanner, "{|", "|}") do
|
273
172
|
""
|
274
173
|
end
|
275
|
-
str.replace(result)
|
276
174
|
end
|
277
175
|
|
278
|
-
def special_chr
|
279
|
-
|
176
|
+
def special_chr(str)
|
177
|
+
HTML_DECODER.decode(str)
|
280
178
|
end
|
281
179
|
|
282
|
-
def remove_inbetween
|
180
|
+
def remove_inbetween(str, tagset = ["<", ">"])
|
283
181
|
tagsets = Regexp.quote(tagset.uniq.join(""))
|
284
182
|
regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
|
285
|
-
str.gsub
|
183
|
+
str.gsub(regex, "")
|
286
184
|
end
|
287
185
|
|
288
|
-
def remove_tag
|
289
|
-
str.gsub
|
186
|
+
def remove_tag(str)
|
187
|
+
str.gsub(REMOVE_TAG_REGEX, "")
|
290
188
|
end
|
291
189
|
|
292
|
-
def remove_directive
|
293
|
-
str.gsub
|
190
|
+
def remove_directive(str)
|
191
|
+
str.gsub(REMOVE_DIRECTIVES_REGEX, "")
|
294
192
|
end
|
295
193
|
|
296
|
-
def remove_emphasis
|
297
|
-
str.gsub
|
194
|
+
def remove_emphasis(str)
|
195
|
+
str.gsub(REMOVE_EMPHASIS_REGEX) do
|
298
196
|
$2
|
299
197
|
end
|
300
198
|
end
|
301
199
|
|
302
|
-
def chrref_to_utf
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
u.encode("UTF-8", "UTF-16")
|
314
|
-
end
|
315
|
-
rescue StandardError
|
316
|
-
return nil
|
200
|
+
def chrref_to_utf(num_str)
|
201
|
+
num_str.gsub(CHRREF_TO_UTF_REGEX) do
|
202
|
+
ch = if $1 == "x"
|
203
|
+
$2.to_i(16)
|
204
|
+
else
|
205
|
+
$2.to_i
|
206
|
+
end
|
207
|
+
hi = ch >> 8
|
208
|
+
lo = ch & 0xff
|
209
|
+
u = +"\377\376" << lo.chr << hi.chr
|
210
|
+
u.encode("UTF-8", "UTF-16")
|
317
211
|
end
|
318
|
-
|
212
|
+
rescue StandardError
|
213
|
+
num_str
|
319
214
|
end
|
320
215
|
|
321
|
-
def mndash
|
322
|
-
str.gsub
|
216
|
+
def mndash(str)
|
217
|
+
str.gsub(MNDASH_REGEX, "–")
|
323
218
|
end
|
324
219
|
|
325
|
-
def remove_hr
|
326
|
-
str.gsub
|
220
|
+
def remove_hr(str)
|
221
|
+
str.gsub(REMOVE_HR_REGEX, "")
|
327
222
|
end
|
328
223
|
|
329
|
-
def remove_ref
|
330
|
-
str.gsub
|
224
|
+
def remove_ref(str)
|
225
|
+
str.gsub(FORMAT_REF_REGEX) { "" }
|
331
226
|
end
|
332
227
|
|
333
|
-
def remove_html
|
334
|
-
str.
|
228
|
+
def remove_html(str)
|
229
|
+
res = +str.dup
|
230
|
+
res.gsub!(%r{<[^<>]+/>}) { "" }
|
335
231
|
["div", "gallery", "timeline", "noinclude"].each do |tag|
|
336
|
-
scanner = StringScanner.new(
|
337
|
-
result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do
|
232
|
+
scanner = StringScanner.new(res)
|
233
|
+
result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do
|
338
234
|
""
|
339
235
|
end
|
340
|
-
|
236
|
+
res.replace(result)
|
341
237
|
end
|
238
|
+
res
|
342
239
|
end
|
343
240
|
|
344
|
-
def remove_complex
|
345
|
-
str.gsub
|
346
|
-
str.gsub
|
347
|
-
str.gsub
|
348
|
-
str.gsub
|
349
|
-
str.gsub
|
241
|
+
def remove_complex(str)
|
242
|
+
str = str.gsub(COMPLEX_REGEX_01) { "《#{$1}》" }
|
243
|
+
str = str.gsub(COMPLEX_REGEX_02) { "" }
|
244
|
+
str = str.gsub(COMPLEX_REGEX_03) { "" }
|
245
|
+
str = str.gsub(COMPLEX_REGEX_04) { "" }
|
246
|
+
str.gsub(COMPLEX_REGEX_05) { "" }
|
350
247
|
end
|
351
248
|
|
352
|
-
def make_reference
|
353
|
-
str.gsub
|
354
|
-
str.gsub
|
355
|
-
str.gsub
|
356
|
-
str.gsub
|
249
|
+
def make_reference(str)
|
250
|
+
str = str.gsub(MAKE_REFERENCE_REGEX_A) { "\n" }
|
251
|
+
str = str.gsub(MAKE_REFERENCE_REGEX_B) { "" }
|
252
|
+
str = str.gsub(MAKE_REFERENCE_REGEX_C) { "[ref]" }
|
253
|
+
str.gsub(MAKE_REFERENCE_REGEX_D) { "[/ref]" }
|
357
254
|
end
|
358
255
|
|
359
|
-
def correct_inline_template
|
256
|
+
def correct_inline_template(str)
|
360
257
|
scanner = StringScanner.new(str)
|
361
|
-
|
258
|
+
process_nested_structure(scanner, "{{", "}}") do |contents|
|
362
259
|
parts = contents.split("|")
|
363
260
|
if /\A(?:lang|fontsize)\z/i =~ parts[0]
|
364
261
|
parts.shift
|
365
|
-
elsif /\Alang
|
262
|
+
elsif /\Alang-/i =~ parts[0]
|
366
263
|
parts.shift
|
367
264
|
elsif /\Alang=/i =~ parts[1]
|
368
265
|
parts.shift
|
@@ -373,27 +270,25 @@ module Wp2txt
|
|
373
270
|
else
|
374
271
|
begin
|
375
272
|
keyval = parts[1].split("=")
|
376
|
-
if keyval.size > 1
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
rescue
|
273
|
+
out = if keyval.size > 1
|
274
|
+
keyval[1]
|
275
|
+
else
|
276
|
+
parts[1] || ""
|
277
|
+
end
|
278
|
+
rescue StandardError
|
382
279
|
out = parts[1] || ""
|
383
280
|
end
|
384
281
|
end
|
385
|
-
|
386
282
|
out.strip
|
387
283
|
end
|
388
|
-
str.replace result
|
389
284
|
end
|
390
285
|
|
391
|
-
#################### file related utilities ####################
|
286
|
+
#################### file related utilities ####################
|
392
287
|
|
393
288
|
# collect filenames recursively
|
394
289
|
def collect_files(str, regex = nil)
|
395
290
|
regex ||= //
|
396
|
-
text_array =
|
291
|
+
text_array = []
|
397
292
|
Find.find(str) do |f|
|
398
293
|
text_array << f if regex =~ f
|
399
294
|
end
|
@@ -401,11 +296,11 @@ module Wp2txt
|
|
401
296
|
end
|
402
297
|
|
403
298
|
# modify a file using block/yield mechanism
|
404
|
-
def file_mod(file_path, backup = false
|
299
|
+
def file_mod(file_path, backup = false)
|
405
300
|
File.open(file_path, "r") do |fr|
|
406
301
|
str = fr.read
|
407
302
|
newstr = yield(str)
|
408
|
-
str = newstr
|
303
|
+
str = newstr if nil? newstr
|
409
304
|
File.open("temp", "w") do |tf|
|
410
305
|
tf.write(str)
|
411
306
|
end
|
@@ -417,32 +312,31 @@ module Wp2txt
|
|
417
312
|
end
|
418
313
|
|
419
314
|
# modify files under a directory (recursive)
|
420
|
-
def batch_file_mod(dir_path
|
315
|
+
def batch_file_mod(dir_path)
|
421
316
|
if FileTest.directory?(dir_path)
|
422
317
|
collect_files(dir_path).each do |file|
|
423
318
|
yield file if FileTest.file?(file)
|
424
319
|
end
|
425
|
-
|
426
|
-
yield dir_path
|
320
|
+
elsif FileTest.file?(dir_path)
|
321
|
+
yield dir_path
|
427
322
|
end
|
428
323
|
end
|
429
324
|
|
430
325
|
# take care of difference of separators among environments
|
431
326
|
def correct_separator(input)
|
432
|
-
|
433
|
-
|
327
|
+
case input
|
328
|
+
when String
|
434
329
|
if RUBY_PLATFORM.index("win32")
|
435
|
-
|
330
|
+
input.gsub("/", "\\")
|
436
331
|
else
|
437
|
-
|
332
|
+
input.gsub("\\", "/")
|
438
333
|
end
|
439
|
-
|
440
|
-
|
441
|
-
ret_array = Array.new
|
334
|
+
when Array
|
335
|
+
ret_array = []
|
442
336
|
input.each do |item|
|
443
337
|
ret_array << correct_separator(item)
|
444
338
|
end
|
445
|
-
|
339
|
+
ret_array
|
446
340
|
end
|
447
341
|
end
|
448
342
|
|
@@ -451,17 +345,14 @@ module Wp2txt
|
|
451
345
|
maxwidth = 0
|
452
346
|
|
453
347
|
files.each do |f|
|
454
|
-
width = f.slice(
|
348
|
+
width = f.slice(/-(\d+)\z/, 1).to_s.length.to_i
|
455
349
|
maxwidth = width if maxwidth < width
|
456
|
-
|
457
|
-
|
458
|
-
files.each do |f|
|
459
|
-
newname= f.sub(/\-(\d+)\z/) do
|
460
|
-
"-" + sprintf("%0#{maxwidth}d", $1.to_i)
|
350
|
+
newname = f.sub(/-(\d+)\z/) do
|
351
|
+
"-" + format("%0#{maxwidth}d", $1.to_i)
|
461
352
|
end
|
462
353
|
File.rename(f, newname + ".#{ext}")
|
463
354
|
end
|
464
|
-
|
355
|
+
true
|
465
356
|
end
|
466
357
|
|
467
358
|
# convert int of seconds to string in the format 00:00:00
|
@@ -473,8 +364,6 @@ module Wp2txt
|
|
473
364
|
h = int / 3600
|
474
365
|
m = (int - h * 3600) / 60
|
475
366
|
s = int % 60
|
476
|
-
|
477
|
-
return str
|
367
|
+
format("%02d:%02d:%02d", h, m, s)
|
478
368
|
end
|
479
|
-
|
480
369
|
end
|
data/lib/wp2txt/version.rb
CHANGED