wp2txt 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +42 -13
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +172 -282
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +119 -150
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -11
data/lib/wp2txt/utils.rb
CHANGED
@@ -1,182 +1,87 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
require 'strscan'
|
5
|
-
require 'find'
|
6
|
-
require 'htmlentities'
|
7
|
-
|
8
|
-
###################################################
|
9
|
-
# global variables to save resource for generating regexps
|
10
|
-
# those with a trailing number 1 represent opening tag/markup
|
11
|
-
# those with a trailing number 2 represent closing tag/markup
|
12
|
-
# those without a trailing number contain both opening/closing tags/markups
|
13
|
-
|
14
|
-
$html_decoder = HTMLEntities.new
|
15
|
-
|
16
|
-
$entities = [' ', '<', '>', '&', '"'].zip([' ', '<', '>', '&', '"'])
|
17
|
-
$html_hash = Hash[*$entities.flatten]
|
18
|
-
$html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
|
19
|
-
$ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
|
20
|
-
$ml_template_end_regex = Regexp.new('\}\}\s*$')
|
21
|
-
$ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
|
22
|
-
$ml_linkend_regex = Regexp.new('\]\]\s*$')
|
23
|
-
$isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
|
24
|
-
$isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
|
25
|
-
$in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
26
|
-
$in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
|
27
|
-
$in_inputbox_regex1 = Regexp.new('<inputbox>')
|
28
|
-
$in_inputbox_regex2 = Regexp.new('<\/inputbox>')
|
29
|
-
$in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
|
30
|
-
$in_source_regex1 = Regexp.new('<source.*?>')
|
31
|
-
$in_source_regex2 = Regexp.new('<\/source>')
|
32
|
-
$in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
|
33
|
-
$in_math_regex1 = Regexp.new('<math.*?>')
|
34
|
-
$in_math_regex2 = Regexp.new('<\/math>')
|
35
|
-
$in_heading_regex = Regexp.new('^=+.*?=+$')
|
36
|
-
$in_html_table_regex = Regexp.new('<table.*?><\/table>')
|
37
|
-
$in_html_table_regex1 = Regexp.new('<table\b')
|
38
|
-
$in_html_table_regex2 = Regexp.new('<\/\s*table>')
|
39
|
-
$in_table_regex1 = Regexp.new('^\s*\{\|')
|
40
|
-
$in_table_regex2 = Regexp.new('^\|\}.*?$')
|
41
|
-
$in_unordered_regex = Regexp.new('^\*')
|
42
|
-
$in_ordered_regex = Regexp.new('^\#')
|
43
|
-
$in_pre_regex = Regexp.new('^ ')
|
44
|
-
$in_definition_regex = Regexp.new('^[\;\:]')
|
45
|
-
$blank_line_regex = Regexp.new('^\s*$')
|
46
|
-
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
47
|
-
$remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
|
48
|
-
$remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
|
49
|
-
$remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
50
|
-
$chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
51
|
-
$mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
|
52
|
-
$remove_hr_regex = Regexp.new('^\s*\-+\s*$')
|
53
|
-
$make_reference_regex_a = Regexp.new('<br ?\/>')
|
54
|
-
$make_reference_regex_b = Regexp.new('<ref[^>]*\/>')
|
55
|
-
$make_reference_regex_c = Regexp.new('<ref[^>]*>')
|
56
|
-
$make_reference_regex_d = Regexp.new('<\/ref>')
|
57
|
-
$format_ref_regex = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
|
58
|
-
$heading_onset_regex = Regexp.new('^(\=+)\s+')
|
59
|
-
$heading_coda_regex = Regexp.new('\s+(\=+)$')
|
60
|
-
$list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
|
61
|
-
$pre_marks_regex = Regexp.new('\A\^\ ')
|
62
|
-
$def_marks_regex = Regexp.new('\A[\;\:\ ]+')
|
63
|
-
$onset_bar_regex = Regexp.new('\A[^\|]+\z')
|
64
|
-
|
65
|
-
$category_patterns = ["Category", "Categoria"].join("|")
|
66
|
-
$category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
67
|
-
|
68
|
-
$escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
|
69
|
-
$unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
|
70
|
-
|
71
|
-
$remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
|
72
|
-
$remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
|
73
|
-
$type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
|
74
|
-
|
75
|
-
$single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escape(']')})", Regexp::MULTILINE)
|
76
|
-
$double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
|
77
|
-
$single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
|
78
|
-
$double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
|
79
|
-
$curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
|
80
|
-
|
81
|
-
$complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
|
82
|
-
$complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
|
83
|
-
$complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
|
84
|
-
$complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
85
|
-
$complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
86
|
-
|
87
|
-
$cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
|
88
|
-
$cleanup_regex_02 = Regexp.new('^File:.+$')
|
89
|
-
$cleanup_regex_03 = Regexp.new('^\|.*$')
|
90
|
-
$cleanup_regex_04 = Regexp.new('\{\{.*$')
|
91
|
-
$cleanup_regex_05 = Regexp.new('^.*\}\}')
|
92
|
-
$cleanup_regex_06 = Regexp.new('\{\|.*$')
|
93
|
-
$cleanup_regex_07 = Regexp.new('^.*\|\}')
|
94
|
-
$cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
|
95
|
-
|
96
|
-
###################################################
|
1
|
+
# frozen_string_literal: true
|
97
2
|
|
98
|
-
|
3
|
+
require "strscan"
|
4
|
+
require "find"
|
5
|
+
require_relative "regex"
|
99
6
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
end
|
113
|
-
exit
|
114
|
-
else
|
115
|
-
text.encode!("UTF-16")
|
116
|
-
text.encode!("UTF-8")
|
117
|
-
convert_characters!(text, true)
|
7
|
+
module Wp2txt
|
8
|
+
def convert_characters(text, has_retried = false)
|
9
|
+
text << ""
|
10
|
+
text = chrref_to_utf(text)
|
11
|
+
text = special_chr(text)
|
12
|
+
text = text.encode("UTF-8", "UTF-8", invalid: :replace, replace: "")
|
13
|
+
rescue StandardError # detect invalid byte sequence in UTF-8
|
14
|
+
if has_retried
|
15
|
+
puts "invalid byte sequence detected"
|
16
|
+
puts "******************************"
|
17
|
+
File.open("error_log.txt", "w") do |f|
|
18
|
+
f.write text
|
118
19
|
end
|
20
|
+
exit
|
21
|
+
else
|
22
|
+
text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
|
23
|
+
text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
|
24
|
+
convert_characters(text, true)
|
119
25
|
end
|
120
26
|
end
|
121
|
-
|
122
|
-
def format_wiki
|
123
|
-
remove_complex
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
27
|
+
|
28
|
+
def format_wiki(text, config = {})
|
29
|
+
text = remove_complex(text)
|
30
|
+
text = escape_nowiki(text)
|
31
|
+
text = process_interwiki_links(text)
|
32
|
+
text = process_external_links(text)
|
33
|
+
text = unescape_nowiki(text)
|
34
|
+
text = remove_directive(text)
|
35
|
+
text = remove_emphasis(text)
|
36
|
+
text = mndash(text)
|
37
|
+
text = remove_hr(text)
|
38
|
+
text = remove_tag(text)
|
39
|
+
text = correct_inline_template(text) unless config[:inline]
|
40
|
+
text = remove_templates(text) unless config[:inline]
|
41
|
+
text = remove_table(text) unless config[:table]
|
42
|
+
text
|
137
43
|
end
|
138
|
-
|
139
|
-
def cleanup
|
140
|
-
text.gsub
|
141
|
-
text.gsub
|
142
|
-
text.gsub
|
143
|
-
text.gsub
|
144
|
-
text.gsub
|
145
|
-
text.gsub
|
146
|
-
text.gsub
|
147
|
-
text.gsub
|
148
|
-
text.strip
|
44
|
+
|
45
|
+
def cleanup(text)
|
46
|
+
text = text.gsub(CLEANUP_REGEX_01) { "" }
|
47
|
+
text = text.gsub(CLEANUP_REGEX_02) { "" }
|
48
|
+
text = text.gsub(CLEANUP_REGEX_03) { "" }
|
49
|
+
text = text.gsub(CLEANUP_REGEX_04) { "" }
|
50
|
+
text = text.gsub(CLEANUP_REGEX_05) { "" }
|
51
|
+
text = text.gsub(CLEANUP_REGEX_06) { "" }
|
52
|
+
text = text.gsub(CLEANUP_REGEX_07) { "" }
|
53
|
+
text = text.gsub(CLEANUP_REGEX_08) { "\n\n" }
|
54
|
+
text = text.strip
|
149
55
|
text << "\n\n"
|
150
56
|
end
|
151
57
|
|
152
58
|
#################### parser for nested structure ####################
|
153
|
-
|
59
|
+
|
154
60
|
def process_nested_structure(scanner, left, right, &block)
|
155
|
-
|
156
|
-
buffer = ""
|
61
|
+
buffer = +""
|
157
62
|
begin
|
158
|
-
if left == "[" && right == "]"
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
while str = scanner.scan_until(regex)
|
63
|
+
regex = if left == "[" && right == "]"
|
64
|
+
SINGLE_SQUARE_BRACKET_REGEX
|
65
|
+
elsif left == "[[" && right == "]]"
|
66
|
+
DOUBLE_SQUARE_BRACKET_REGEX
|
67
|
+
elsif left == "{" && right == "}"
|
68
|
+
SINGLE_CURLY_BRACKET_REGEX
|
69
|
+
elsif left == "{{" && right == "}}"
|
70
|
+
DOUBLE_CURLY_BRACKET_REGEX
|
71
|
+
elsif left == "{|" && right == "|}"
|
72
|
+
CURLY_SQUARE_BRACKET_REGEX
|
73
|
+
else
|
74
|
+
Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
|
75
|
+
end
|
76
|
+
while (str = scanner.scan_until(regex))
|
172
77
|
case scanner[1]
|
173
78
|
when left
|
174
79
|
buffer << str
|
175
80
|
has_left = true
|
176
81
|
when right
|
177
82
|
if has_left
|
178
|
-
buffer = buffer[0...-
|
179
|
-
contents = block.call(str[0...-
|
83
|
+
buffer = buffer[0...-left.size]
|
84
|
+
contents = block.call(str[0...-left.size])
|
180
85
|
buffer << contents
|
181
86
|
break
|
182
87
|
else
|
@@ -186,25 +91,23 @@ module Wp2txt
|
|
186
91
|
end
|
187
92
|
buffer << scanner.rest
|
188
93
|
|
189
|
-
if buffer == scanner.string
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
rescue => e
|
196
|
-
return scanner.string
|
94
|
+
return buffer if buffer == scanner.string
|
95
|
+
|
96
|
+
scanner.string = buffer
|
97
|
+
process_nested_structure(scanner, left, right, &block) || ""
|
98
|
+
rescue StandardError
|
99
|
+
scanner.string
|
197
100
|
end
|
198
|
-
end
|
101
|
+
end
|
199
102
|
|
200
103
|
#################### methods used from format_wiki ####################
|
201
|
-
def escape_nowiki
|
104
|
+
def escape_nowiki(str)
|
202
105
|
if @nowikis
|
203
106
|
@nowikis.clear
|
204
107
|
else
|
205
108
|
@nowikis = {}
|
206
109
|
end
|
207
|
-
str.gsub
|
110
|
+
str.gsub(ESCAPE_NOWIKI_REGEX) do
|
208
111
|
nowiki = $1
|
209
112
|
nowiki_id = nowiki.object_id
|
210
113
|
@nowikis[nowiki_id] = nowiki
|
@@ -212,17 +115,17 @@ module Wp2txt
|
|
212
115
|
end
|
213
116
|
end
|
214
117
|
|
215
|
-
def unescape_nowiki
|
216
|
-
str.gsub
|
118
|
+
def unescape_nowiki(str)
|
119
|
+
str.gsub(UNESCAPE_NOWIKI_REGEX) do
|
217
120
|
obj_id = $1.to_i
|
218
121
|
@nowikis[obj_id]
|
219
122
|
end
|
220
123
|
end
|
221
|
-
|
222
|
-
def process_interwiki_links
|
124
|
+
|
125
|
+
def process_interwiki_links(str)
|
223
126
|
scanner = StringScanner.new(str)
|
224
|
-
|
225
|
-
parts = contents.split("|")
|
127
|
+
process_nested_structure(scanner, "[[", "]]") do |contents|
|
128
|
+
parts = contents.split("|")
|
226
129
|
case parts.size
|
227
130
|
when 1
|
228
131
|
parts.first || ""
|
@@ -231,12 +134,11 @@ module Wp2txt
|
|
231
134
|
parts.join("|")
|
232
135
|
end
|
233
136
|
end
|
234
|
-
str.replace(result)
|
235
137
|
end
|
236
138
|
|
237
|
-
def process_external_links
|
139
|
+
def process_external_links(str)
|
238
140
|
scanner = StringScanner.new(str)
|
239
|
-
|
141
|
+
process_nested_structure(scanner, "[", "]") do |contents|
|
240
142
|
if /\A\s.+\s\z/ =~ contents
|
241
143
|
" (#{contents.strip}) "
|
242
144
|
else
|
@@ -249,119 +151,115 @@ module Wp2txt
|
|
249
151
|
end
|
250
152
|
end
|
251
153
|
end
|
252
|
-
str.replace(result)
|
253
154
|
end
|
254
155
|
|
255
156
|
#################### methods used from format_article ####################
|
256
157
|
|
257
|
-
def remove_templates
|
258
|
-
|
259
|
-
result = process_nested_structure(
|
158
|
+
def remove_templates(str)
|
159
|
+
scanner1 = StringScanner.new(str)
|
160
|
+
result = process_nested_structure(scanner1, "{{", "}}") do
|
260
161
|
""
|
261
162
|
end
|
262
|
-
|
263
|
-
|
163
|
+
scanner2 = StringScanner.new(result)
|
164
|
+
process_nested_structure(scanner2, "{", "}") do
|
264
165
|
""
|
265
166
|
end
|
266
|
-
str.replace(result)
|
267
167
|
end
|
268
|
-
|
269
|
-
def remove_table
|
168
|
+
|
169
|
+
def remove_table(str)
|
270
170
|
scanner = StringScanner.new(str)
|
271
|
-
|
171
|
+
process_nested_structure(scanner, "{|", "|}") do
|
272
172
|
""
|
273
173
|
end
|
274
|
-
str.replace(result)
|
275
174
|
end
|
276
|
-
|
277
|
-
def special_chr
|
278
|
-
|
175
|
+
|
176
|
+
def special_chr(str)
|
177
|
+
HTML_DECODER.decode(str)
|
279
178
|
end
|
280
179
|
|
281
|
-
def remove_inbetween
|
180
|
+
def remove_inbetween(str, tagset = ["<", ">"])
|
282
181
|
tagsets = Regexp.quote(tagset.uniq.join(""))
|
283
182
|
regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
|
284
|
-
str.gsub
|
183
|
+
str.gsub(regex, "")
|
285
184
|
end
|
286
185
|
|
287
|
-
def remove_tag
|
288
|
-
str.gsub
|
186
|
+
def remove_tag(str)
|
187
|
+
str.gsub(REMOVE_TAG_REGEX, "")
|
289
188
|
end
|
290
189
|
|
291
|
-
def remove_directive
|
292
|
-
str.gsub
|
190
|
+
def remove_directive(str)
|
191
|
+
str.gsub(REMOVE_DIRECTIVES_REGEX, "")
|
293
192
|
end
|
294
193
|
|
295
|
-
def remove_emphasis
|
296
|
-
str.gsub
|
194
|
+
def remove_emphasis(str)
|
195
|
+
str.gsub(REMOVE_EMPHASIS_REGEX) do
|
297
196
|
$2
|
298
197
|
end
|
299
198
|
end
|
300
199
|
|
301
|
-
def chrref_to_utf
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
u.encode("UTF-8", "UTF-16")
|
313
|
-
end
|
314
|
-
rescue StandardError
|
315
|
-
return nil
|
200
|
+
def chrref_to_utf(num_str)
|
201
|
+
num_str.gsub(CHRREF_TO_UTF_REGEX) do
|
202
|
+
ch = if $1 == "x"
|
203
|
+
$2.to_i(16)
|
204
|
+
else
|
205
|
+
$2.to_i
|
206
|
+
end
|
207
|
+
hi = ch >> 8
|
208
|
+
lo = ch & 0xff
|
209
|
+
u = +"\377\376" << lo.chr << hi.chr
|
210
|
+
u.encode("UTF-8", "UTF-16")
|
316
211
|
end
|
317
|
-
|
212
|
+
rescue StandardError
|
213
|
+
num_str
|
318
214
|
end
|
319
|
-
|
320
|
-
def mndash
|
321
|
-
str.gsub
|
215
|
+
|
216
|
+
def mndash(str)
|
217
|
+
str.gsub(MNDASH_REGEX, "–")
|
322
218
|
end
|
323
219
|
|
324
|
-
def remove_hr
|
325
|
-
str.gsub
|
220
|
+
def remove_hr(str)
|
221
|
+
str.gsub(REMOVE_HR_REGEX, "")
|
326
222
|
end
|
327
223
|
|
328
|
-
def remove_ref
|
329
|
-
str.gsub
|
224
|
+
def remove_ref(str)
|
225
|
+
str.gsub(FORMAT_REF_REGEX) { "" }
|
330
226
|
end
|
331
227
|
|
332
|
-
def remove_html
|
333
|
-
str.
|
228
|
+
def remove_html(str)
|
229
|
+
res = +str.dup
|
230
|
+
res.gsub!(%r{<[^<>]+/>}) { "" }
|
334
231
|
["div", "gallery", "timeline", "noinclude"].each do |tag|
|
335
|
-
scanner = StringScanner.new(
|
336
|
-
result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do
|
232
|
+
scanner = StringScanner.new(res)
|
233
|
+
result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do
|
337
234
|
""
|
338
235
|
end
|
339
|
-
|
236
|
+
res.replace(result)
|
340
237
|
end
|
238
|
+
res
|
341
239
|
end
|
342
240
|
|
343
|
-
def remove_complex
|
344
|
-
str.gsub
|
345
|
-
str.gsub
|
346
|
-
str.gsub
|
347
|
-
str.gsub
|
348
|
-
str.gsub
|
241
|
+
def remove_complex(str)
|
242
|
+
str = str.gsub(COMPLEX_REGEX_01) { "《#{$1}》" }
|
243
|
+
str = str.gsub(COMPLEX_REGEX_02) { "" }
|
244
|
+
str = str.gsub(COMPLEX_REGEX_03) { "" }
|
245
|
+
str = str.gsub(COMPLEX_REGEX_04) { "" }
|
246
|
+
str.gsub(COMPLEX_REGEX_05) { "" }
|
349
247
|
end
|
350
|
-
|
351
|
-
def make_reference
|
352
|
-
str.gsub
|
353
|
-
str.gsub
|
354
|
-
str.gsub
|
355
|
-
str.gsub
|
248
|
+
|
249
|
+
def make_reference(str)
|
250
|
+
str = str.gsub(MAKE_REFERENCE_REGEX_A) { "\n" }
|
251
|
+
str = str.gsub(MAKE_REFERENCE_REGEX_B) { "" }
|
252
|
+
str = str.gsub(MAKE_REFERENCE_REGEX_C) { "[ref]" }
|
253
|
+
str.gsub(MAKE_REFERENCE_REGEX_D) { "[/ref]" }
|
356
254
|
end
|
357
255
|
|
358
|
-
def correct_inline_template
|
256
|
+
def correct_inline_template(str)
|
359
257
|
scanner = StringScanner.new(str)
|
360
|
-
|
258
|
+
process_nested_structure(scanner, "{{", "}}") do |contents|
|
361
259
|
parts = contents.split("|")
|
362
260
|
if /\A(?:lang|fontsize)\z/i =~ parts[0]
|
363
261
|
parts.shift
|
364
|
-
elsif /\Alang
|
262
|
+
elsif /\Alang-/i =~ parts[0]
|
365
263
|
parts.shift
|
366
264
|
elsif /\Alang=/i =~ parts[1]
|
367
265
|
parts.shift
|
@@ -372,27 +270,25 @@ module Wp2txt
|
|
372
270
|
else
|
373
271
|
begin
|
374
272
|
keyval = parts[1].split("=")
|
375
|
-
if keyval.size > 1
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
rescue
|
273
|
+
out = if keyval.size > 1
|
274
|
+
keyval[1]
|
275
|
+
else
|
276
|
+
parts[1] || ""
|
277
|
+
end
|
278
|
+
rescue StandardError
|
381
279
|
out = parts[1] || ""
|
382
280
|
end
|
383
281
|
end
|
384
|
-
|
385
282
|
out.strip
|
386
283
|
end
|
387
|
-
str.replace result
|
388
284
|
end
|
389
285
|
|
390
|
-
#################### file related utilities ####################
|
286
|
+
#################### file related utilities ####################
|
391
287
|
|
392
288
|
# collect filenames recursively
|
393
289
|
def collect_files(str, regex = nil)
|
394
290
|
regex ||= //
|
395
|
-
text_array =
|
291
|
+
text_array = []
|
396
292
|
Find.find(str) do |f|
|
397
293
|
text_array << f if regex =~ f
|
398
294
|
end
|
@@ -400,11 +296,11 @@ module Wp2txt
|
|
400
296
|
end
|
401
297
|
|
402
298
|
# modify a file using block/yield mechanism
|
403
|
-
def file_mod(file_path, backup = false
|
299
|
+
def file_mod(file_path, backup = false)
|
404
300
|
File.open(file_path, "r") do |fr|
|
405
301
|
str = fr.read
|
406
302
|
newstr = yield(str)
|
407
|
-
str = newstr
|
303
|
+
str = newstr if nil? newstr
|
408
304
|
File.open("temp", "w") do |tf|
|
409
305
|
tf.write(str)
|
410
306
|
end
|
@@ -413,54 +309,50 @@ module Wp2txt
|
|
413
309
|
File.rename(file_path, file_path + ".bak")
|
414
310
|
File.rename("temp", file_path)
|
415
311
|
File.unlink(file_path + ".bak") unless backup
|
416
|
-
end
|
312
|
+
end
|
417
313
|
|
418
314
|
# modify files under a directry (recursive)
|
419
|
-
def batch_file_mod(dir_path
|
315
|
+
def batch_file_mod(dir_path)
|
420
316
|
if FileTest.directory?(dir_path)
|
421
317
|
collect_files(dir_path).each do |file|
|
422
318
|
yield file if FileTest.file?(file)
|
423
319
|
end
|
424
|
-
|
425
|
-
yield dir_path
|
320
|
+
elsif FileTest.file?(dir_path)
|
321
|
+
yield dir_path
|
426
322
|
end
|
427
323
|
end
|
428
324
|
|
429
325
|
# take care of difference of separators among environments
|
430
326
|
def correct_separator(input)
|
431
|
-
|
432
|
-
|
327
|
+
case input
|
328
|
+
when String
|
433
329
|
if RUBY_PLATFORM.index("win32")
|
434
|
-
|
330
|
+
input.gsub("/", "\\")
|
435
331
|
else
|
436
|
-
|
332
|
+
input.gsub("\\", "/")
|
437
333
|
end
|
438
|
-
|
439
|
-
|
440
|
-
ret_array = Array.new
|
334
|
+
when Array
|
335
|
+
ret_array = []
|
441
336
|
input.each do |item|
|
442
337
|
ret_array << correct_separator(item)
|
443
338
|
end
|
444
|
-
|
339
|
+
ret_array
|
445
340
|
end
|
446
341
|
end
|
447
342
|
|
448
|
-
def rename(files, ext = "txt")
|
343
|
+
def rename(files, ext = "txt")
|
449
344
|
# num of digits necessary to name the last file generated
|
450
|
-
maxwidth = 0
|
345
|
+
maxwidth = 0
|
451
346
|
|
452
347
|
files.each do |f|
|
453
|
-
width = f.slice(
|
348
|
+
width = f.slice(/-(\d+)\z/, 1).to_s.length.to_i
|
454
349
|
maxwidth = width if maxwidth < width
|
455
|
-
|
456
|
-
|
457
|
-
files.each do |f|
|
458
|
-
newname= f.sub(/\-(\d+)\z/) do
|
459
|
-
"-" + sprintf("%0#{maxwidth}d", $1.to_i)
|
350
|
+
newname = f.sub(/-(\d+)\z/) do
|
351
|
+
"-" + format("%0#{maxwidth}d", $1.to_i)
|
460
352
|
end
|
461
353
|
File.rename(f, newname + ".#{ext}")
|
462
354
|
end
|
463
|
-
|
355
|
+
true
|
464
356
|
end
|
465
357
|
|
466
358
|
# convert int of seconds to string in the format 00:00:00
|
@@ -472,8 +364,6 @@ module Wp2txt
|
|
472
364
|
h = int / 3600
|
473
365
|
m = (int - h * 3600) / 60
|
474
366
|
s = int % 60
|
475
|
-
|
476
|
-
return str
|
367
|
+
format("%02d:%02d:%02d", h, m, s)
|
477
368
|
end
|
478
|
-
|
479
369
|
end
|
data/lib/wp2txt/version.rb
CHANGED