wp2txt 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +42 -13
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +172 -282
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +119 -150
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -11
data/lib/wp2txt/utils.rb
CHANGED
@@ -1,182 +1,87 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
require 'strscan'
|
5
|
-
require 'find'
|
6
|
-
require 'htmlentities'
|
7
|
-
|
8
|
-
###################################################
|
9
|
-
# global variables to save resource for generating regexps
|
10
|
-
# those with a trailing number 1 represent opening tag/markup
|
11
|
-
# those with a trailing number 2 represent closing tag/markup
|
12
|
-
# those without a trailing number contain both opening/closing tags/markups
|
13
|
-
|
14
|
-
$html_decoder = HTMLEntities.new
|
15
|
-
|
16
|
-
$entities = [' ', '<', '>', '&', '"'].zip([' ', '<', '>', '&', '"'])
|
17
|
-
$html_hash = Hash[*$entities.flatten]
|
18
|
-
$html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
|
19
|
-
$ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
|
20
|
-
$ml_template_end_regex = Regexp.new('\}\}\s*$')
|
21
|
-
$ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
|
22
|
-
$ml_linkend_regex = Regexp.new('\]\]\s*$')
|
23
|
-
$isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
|
24
|
-
$isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
|
25
|
-
$in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
26
|
-
$in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
|
27
|
-
$in_inputbox_regex1 = Regexp.new('<inputbox>')
|
28
|
-
$in_inputbox_regex2 = Regexp.new('<\/inputbox>')
|
29
|
-
$in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
|
30
|
-
$in_source_regex1 = Regexp.new('<source.*?>')
|
31
|
-
$in_source_regex2 = Regexp.new('<\/source>')
|
32
|
-
$in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
|
33
|
-
$in_math_regex1 = Regexp.new('<math.*?>')
|
34
|
-
$in_math_regex2 = Regexp.new('<\/math>')
|
35
|
-
$in_heading_regex = Regexp.new('^=+.*?=+$')
|
36
|
-
$in_html_table_regex = Regexp.new('<table.*?><\/table>')
|
37
|
-
$in_html_table_regex1 = Regexp.new('<table\b')
|
38
|
-
$in_html_table_regex2 = Regexp.new('<\/\s*table>')
|
39
|
-
$in_table_regex1 = Regexp.new('^\s*\{\|')
|
40
|
-
$in_table_regex2 = Regexp.new('^\|\}.*?$')
|
41
|
-
$in_unordered_regex = Regexp.new('^\*')
|
42
|
-
$in_ordered_regex = Regexp.new('^\#')
|
43
|
-
$in_pre_regex = Regexp.new('^ ')
|
44
|
-
$in_definition_regex = Regexp.new('^[\;\:]')
|
45
|
-
$blank_line_regex = Regexp.new('^\s*$')
|
46
|
-
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
47
|
-
$remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
|
48
|
-
$remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
|
49
|
-
$remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
50
|
-
$chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
51
|
-
$mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
|
52
|
-
$remove_hr_regex = Regexp.new('^\s*\-+\s*$')
|
53
|
-
$make_reference_regex_a = Regexp.new('<br ?\/>')
|
54
|
-
$make_reference_regex_b = Regexp.new('<ref[^>]*\/>')
|
55
|
-
$make_reference_regex_c = Regexp.new('<ref[^>]*>')
|
56
|
-
$make_reference_regex_d = Regexp.new('<\/ref>')
|
57
|
-
$format_ref_regex = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
|
58
|
-
$heading_onset_regex = Regexp.new('^(\=+)\s+')
|
59
|
-
$heading_coda_regex = Regexp.new('\s+(\=+)$')
|
60
|
-
$list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
|
61
|
-
$pre_marks_regex = Regexp.new('\A\^\ ')
|
62
|
-
$def_marks_regex = Regexp.new('\A[\;\:\ ]+')
|
63
|
-
$onset_bar_regex = Regexp.new('\A[^\|]+\z')
|
64
|
-
|
65
|
-
$category_patterns = ["Category", "Categoria"].join("|")
|
66
|
-
$category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
67
|
-
|
68
|
-
$escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
|
69
|
-
$unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
|
70
|
-
|
71
|
-
$remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
|
72
|
-
$remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
|
73
|
-
$type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
|
74
|
-
|
75
|
-
$single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escape(']')})", Regexp::MULTILINE)
|
76
|
-
$double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
|
77
|
-
$single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
|
78
|
-
$double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
|
79
|
-
$curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
|
80
|
-
|
81
|
-
$complex_regex_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
|
82
|
-
$complex_regex_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
|
83
|
-
$complex_regex_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
|
84
|
-
$complex_regex_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
85
|
-
$complex_regex_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
|
86
|
-
|
87
|
-
$cleanup_regex_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
|
88
|
-
$cleanup_regex_02 = Regexp.new('^File:.+$')
|
89
|
-
$cleanup_regex_03 = Regexp.new('^\|.*$')
|
90
|
-
$cleanup_regex_04 = Regexp.new('\{\{.*$')
|
91
|
-
$cleanup_regex_05 = Regexp.new('^.*\}\}')
|
92
|
-
$cleanup_regex_06 = Regexp.new('\{\|.*$')
|
93
|
-
$cleanup_regex_07 = Regexp.new('^.*\|\}')
|
94
|
-
$cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
|
95
|
-
|
96
|
-
###################################################
|
1
|
+
# frozen_string_literal: true
|
97
2
|
|
98
|
-
|
3
|
+
require "strscan"
|
4
|
+
require "find"
|
5
|
+
require_relative "regex"
|
99
6
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
end
|
113
|
-
exit
|
114
|
-
else
|
115
|
-
text.encode!("UTF-16")
|
116
|
-
text.encode!("UTF-8")
|
117
|
-
convert_characters!(text, true)
|
7
|
+
module Wp2txt
|
8
|
+
# Decodes character references and HTML entities in +text+ and returns the
# result with byte sequences that are invalid in UTF-8 removed.
#
# The decoding itself is delegated to chrref_to_utf and special_chr. If any
# step raises a StandardError, the text is re-tagged as UTF-8, stripped of
# invalid byte sequences, and the conversion is attempted exactly once more.
# If the retry fails as well, the text is written to "error_log.txt" in the
# current working directory and the process exits.
#
# text        - String to convert.
# has_retried - true when this call is the single permitted retry.
def convert_characters(text, has_retried = false)
  # NOTE(review): appending an empty string changes nothing, but it raises on
  # a frozen string, which sends frozen input through the retry path below.
  # Kept as in the original -- confirm whether that is the intent.
  text << ""
  text = chrref_to_utf(text)
  text = special_chr(text)
  text.encode("UTF-8", "UTF-8", invalid: :replace, replace: "")
rescue StandardError # e.g. invalid byte sequence in UTF-8
  if has_retried
    puts "invalid byte sequence detected"
    puts "******************************"
    File.open("error_log.txt", "w") do |f|
      f.write text
    end
    exit
  else
    # Work on an unfrozen copy, interpret it as UTF-8 (the same assumption the
    # final encode above makes) and drop whatever is not valid UTF-8, so that
    # the helpers receive a string they can process on the second attempt.
    cleaned = text.dup.force_encoding(Encoding::UTF_8).scrub("")
    convert_characters(cleaned, true)
  end
end
|
121
|
-
|
122
|
-
def format_wiki
|
123
|
-
remove_complex
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
27
|
+
|
28
|
+
# Runs the wiki-markup clean-up pipeline over +text+ and returns the result.
#
# The unconditional steps are applied in the order listed below; each step
# receives the output of the previous one. Afterwards:
#   * unless config[:inline] is truthy, correct_inline_template and then
#     remove_templates are applied;
#   * unless config[:table] is truthy, remove_table is applied.
def format_wiki(text, config = {})
  always = %i[
    remove_complex
    escape_nowiki
    process_interwiki_links
    process_external_links
    unescape_nowiki
    remove_directive
    remove_emphasis
    mndash
    remove_hr
    remove_tag
  ]
  always.each { |step| text = send(step, text) }

  unless config[:inline]
    text = correct_inline_template(text)
    text = remove_templates(text)
  end
  text = remove_table(text) unless config[:table]
  text
end
|
138
|
-
|
139
|
-
def cleanup
|
140
|
-
text.gsub
|
141
|
-
text.gsub
|
142
|
-
text.gsub
|
143
|
-
text.gsub
|
144
|
-
text.gsub
|
145
|
-
text.gsub
|
146
|
-
text.gsub
|
147
|
-
text.gsub
|
148
|
-
text.strip
|
44
|
+
|
45
|
+
# Final tidy-up pass: deletes every match of CLEANUP_REGEX_01..07 (in that
# order), replaces each match of CLEANUP_REGEX_08 with a blank line ("\n\n"),
# strips surrounding whitespace and terminates the text with "\n\n".
# The regex constants are defined outside this file.
def cleanup(text)
  deletions = [
    CLEANUP_REGEX_01, CLEANUP_REGEX_02, CLEANUP_REGEX_03, CLEANUP_REGEX_04,
    CLEANUP_REGEX_05, CLEANUP_REGEX_06, CLEANUP_REGEX_07
  ]
  deletions.each { |pattern| text = text.gsub(pattern) { "" } }
  collapsed = text.gsub(CLEANUP_REGEX_08) { "\n\n" }
  collapsed.strip << "\n\n"
end
|
151
57
|
|
152
58
|
#################### parser for nested structure ####################
|
153
|
-
|
59
|
+
|
154
60
|
def process_nested_structure(scanner, left, right, &block)
|
155
|
-
|
156
|
-
buffer = ""
|
61
|
+
buffer = +""
|
157
62
|
begin
|
158
|
-
if left == "[" && right == "]"
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
while str = scanner.scan_until(regex)
|
63
|
+
regex = if left == "[" && right == "]"
|
64
|
+
SINGLE_SQUARE_BRACKET_REGEX
|
65
|
+
elsif left == "[[" && right == "]]"
|
66
|
+
DOUBLE_SQUARE_BRACKET_REGEX
|
67
|
+
elsif left == "{" && right == "}"
|
68
|
+
SINGLE_CURLY_BRACKET_REGEX
|
69
|
+
elsif left == "{{" && right == "}}"
|
70
|
+
DOUBLE_CURLY_BRACKET_REGEX
|
71
|
+
elsif left == "{|" && right == "|}"
|
72
|
+
CURLY_SQUARE_BRACKET_REGEX
|
73
|
+
else
|
74
|
+
Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
|
75
|
+
end
|
76
|
+
while (str = scanner.scan_until(regex))
|
172
77
|
case scanner[1]
|
173
78
|
when left
|
174
79
|
buffer << str
|
175
80
|
has_left = true
|
176
81
|
when right
|
177
82
|
if has_left
|
178
|
-
buffer = buffer[0...-
|
179
|
-
contents = block.call(str[0...-
|
83
|
+
buffer = buffer[0...-left.size]
|
84
|
+
contents = block.call(str[0...-left.size])
|
180
85
|
buffer << contents
|
181
86
|
break
|
182
87
|
else
|
@@ -186,25 +91,23 @@ module Wp2txt
|
|
186
91
|
end
|
187
92
|
buffer << scanner.rest
|
188
93
|
|
189
|
-
if buffer == scanner.string
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
rescue => e
|
196
|
-
return scanner.string
|
94
|
+
return buffer if buffer == scanner.string
|
95
|
+
|
96
|
+
scanner.string = buffer
|
97
|
+
process_nested_structure(scanner, left, right, &block) || ""
|
98
|
+
rescue StandardError
|
99
|
+
scanner.string
|
197
100
|
end
|
198
|
-
end
|
101
|
+
end
|
199
102
|
|
200
103
|
#################### methods used from format_wiki ####################
|
201
|
-
def escape_nowiki
|
104
|
+
def escape_nowiki(str)
|
202
105
|
if @nowikis
|
203
106
|
@nowikis.clear
|
204
107
|
else
|
205
108
|
@nowikis = {}
|
206
109
|
end
|
207
|
-
str.gsub
|
110
|
+
str.gsub(ESCAPE_NOWIKI_REGEX) do
|
208
111
|
nowiki = $1
|
209
112
|
nowiki_id = nowiki.object_id
|
210
113
|
@nowikis[nowiki_id] = nowiki
|
@@ -212,17 +115,17 @@ module Wp2txt
|
|
212
115
|
end
|
213
116
|
end
|
214
117
|
|
215
|
-
def unescape_nowiki
|
216
|
-
str.gsub
|
118
|
+
# Replaces every match of UNESCAPE_NOWIKI_REGEX in +str+ with the entry of
# @nowikis whose key is the integer value of the first capture group.
# @nowikis is the hash filled by escape_nowiki; a key that is not present
# yields nil, which gsub substitutes as an empty string.
def unescape_nowiki(str)
  str.gsub(UNESCAPE_NOWIKI_REGEX) { @nowikis[Regexp.last_match(1).to_i] }
end
|
221
|
-
|
222
|
-
def process_interwiki_links
|
124
|
+
|
125
|
+
def process_interwiki_links(str)
|
223
126
|
scanner = StringScanner.new(str)
|
224
|
-
|
225
|
-
parts = contents.split("|")
|
127
|
+
process_nested_structure(scanner, "[[", "]]") do |contents|
|
128
|
+
parts = contents.split("|")
|
226
129
|
case parts.size
|
227
130
|
when 1
|
228
131
|
parts.first || ""
|
@@ -231,12 +134,11 @@ module Wp2txt
|
|
231
134
|
parts.join("|")
|
232
135
|
end
|
233
136
|
end
|
234
|
-
str.replace(result)
|
235
137
|
end
|
236
138
|
|
237
|
-
def process_external_links
|
139
|
+
def process_external_links(str)
|
238
140
|
scanner = StringScanner.new(str)
|
239
|
-
|
141
|
+
process_nested_structure(scanner, "[", "]") do |contents|
|
240
142
|
if /\A\s.+\s\z/ =~ contents
|
241
143
|
" (#{contents.strip}) "
|
242
144
|
else
|
@@ -249,119 +151,115 @@ module Wp2txt
|
|
249
151
|
end
|
250
152
|
end
|
251
153
|
end
|
252
|
-
str.replace(result)
|
253
154
|
end
|
254
155
|
|
255
156
|
#################### methods used from format_article ####################
|
256
157
|
|
257
|
-
def remove_templates
|
258
|
-
|
259
|
-
result = process_nested_structure(
|
158
|
+
# Removes brace-delimited spans from +str+ in two passes: first everything
# process_nested_structure hands over for the "{{" / "}}" pair, then, on the
# result of that pass, everything for the "{" / "}" pair. Each span is
# replaced by an empty string. Returns the result of the second pass.
def remove_templates(str)
  without_double = process_nested_structure(StringScanner.new(str), "{{", "}}") { "" }
  process_nested_structure(StringScanner.new(without_double), "{", "}") { "" }
end
|
268
|
-
|
269
|
-
def remove_table
|
168
|
+
|
169
|
+
# Replaces every span that process_nested_structure yields for the
# "{|" / "|}" delimiter pair with an empty string and returns the result.
def remove_table(str)
  process_nested_structure(StringScanner.new(str), "{|", "|}") { "" }
end
|
276
|
-
|
277
|
-
def special_chr
|
278
|
-
|
175
|
+
|
176
|
+
# Returns +str+ as decoded by HTML_DECODER.decode.
# HTML_DECODER is defined outside this file (presumably an HTML entity
# decoder instance -- confirm against its definition).
def special_chr(str)
  decoder = HTML_DECODER
  decoder.decode(str)
end
|
280
179
|
|
281
|
-
def remove_inbetween
|
180
|
+
# Deletes every span of +str+ that starts with tagset[0], ends with
# tagset[1] and contains none of the delimiter characters in between.
#
# str    - String to filter.
# tagset - two-element Array [opening, closing]; defaults to ["<", ">"].
#
# Returns a new String.
def remove_inbetween(str, tagset = ["<", ">"])
  opening = Regexp.escape(tagset[0])
  closing = Regexp.escape(tagset[1])
  # Characters that may not appear between the delimiters.
  forbidden = Regexp.quote(tagset.uniq.join(""))
  pattern = /#{opening}[^#{forbidden}]*#{closing}/
  str.gsub(pattern, "")
end
|
286
185
|
|
287
|
-
def remove_tag
|
288
|
-
str.gsub
|
186
|
+
# Returns a copy of +str+ with every match of REMOVE_TAG_REGEX deleted.
def remove_tag(str)
  str.gsub(REMOVE_TAG_REGEX) { "" }
end
|
290
189
|
|
291
|
-
def remove_directive
|
292
|
-
str.gsub
|
190
|
+
# Returns a copy of +str+ with every match of REMOVE_DIRECTIVES_REGEX deleted.
def remove_directive(str)
  str.gsub(REMOVE_DIRECTIVES_REGEX) { "" }
end
|
294
193
|
|
295
|
-
def remove_emphasis
|
296
|
-
str.gsub
|
194
|
+
# Replaces every match of REMOVE_EMPHASIS_REGEX in +str+ with the text of
# the regex's second capture group and returns the result.
def remove_emphasis(str)
  str.gsub(REMOVE_EMPHASIS_REGEX) { Regexp.last_match(2) }
end
|
300
199
|
|
301
|
-
def chrref_to_utf
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
u.encode("UTF-8", "UTF-16")
|
313
|
-
end
|
314
|
-
rescue StandardError
|
315
|
-
return nil
|
200
|
+
# Replaces numeric character references in +num_str+ with the UTF-8
# character they denote and returns the resulting string.
#
# CHRREF_TO_UTF_REGEX (defined outside this file) is expected to capture an
# optional "x" marker in group 1 and the digits in group 2: with the marker
# the digits are read as hexadecimal, otherwise as decimal.
#
# Every valid code point is converted, including those above U+FFFF. A
# reference whose number is not a valid code point (a surrogate, or a value
# beyond the Unicode range) is left in the text unchanged. If the
# substitution itself raises a StandardError, +num_str+ is returned as is.
def chrref_to_utf(num_str)
  num_str.gsub(CHRREF_TO_UTF_REGEX) do
    reference = Regexp.last_match(0)
    digits = Regexp.last_match(2)
    code_point = Regexp.last_match(1) == "x" ? digits.to_i(16) : digits.to_i
    begin
      code_point.chr(Encoding::UTF_8)
    rescue RangeError
      # Not a valid code point: keep the reference text untouched.
      reference
    end
  end
rescue StandardError
  num_str
end
|
319
|
-
|
320
|
-
def mndash
|
321
|
-
str.gsub
|
215
|
+
|
216
|
+
# Returns a copy of +str+ in which every match of MNDASH_REGEX is replaced
# by an en dash ("–").
def mndash(str)
  str.gsub(MNDASH_REGEX) { "–" }
end
|
323
219
|
|
324
|
-
def remove_hr
|
325
|
-
str.gsub
|
220
|
+
# Returns a copy of +str+ with every match of REMOVE_HR_REGEX deleted.
def remove_hr(str)
  str.gsub(REMOVE_HR_REGEX) { "" }
end
|
327
223
|
|
328
|
-
def remove_ref
|
329
|
-
str.gsub
|
224
|
+
# Returns a copy of +str+ with every match of FORMAT_REF_REGEX deleted.
def remove_ref(str)
  str.gsub(FORMAT_REF_REGEX, "")
end
|
331
227
|
|
332
|
-
def remove_html
|
333
|
-
str.
|
228
|
+
# Returns a copy of +str+ with self-closing tags ("<... />") deleted and,
# for each of div, gallery, timeline and noinclude in turn, every span that
# process_nested_structure yields between "<tag" and "tag>" replaced by an
# empty string. The argument itself is not modified.
def remove_html(str)
  res = str.dup
  res.gsub!(%r{<[^<>]+/>}) { "" }
  %w[div gallery timeline noinclude].each do |tag|
    stripped = process_nested_structure(StringScanner.new(res), "<#{tag}", "#{tag}>") { "" }
    res.replace(stripped)
  end
  res
end
|
342
240
|
|
343
|
-
def remove_complex
|
344
|
-
str.gsub
|
345
|
-
str.gsub
|
346
|
-
str.gsub
|
347
|
-
str.gsub
|
348
|
-
str.gsub
|
241
|
+
# Applies the COMPLEX_REGEX_* substitutions to +str+ in order and returns
# the result: a match of COMPLEX_REGEX_01 is rewritten to its first capture
# group wrapped in 《 》, and matches of COMPLEX_REGEX_02..05 are deleted.
def remove_complex(str)
  bracketed = str.gsub(COMPLEX_REGEX_01) { "《#{Regexp.last_match(1)}》" }
  deletions = [COMPLEX_REGEX_02, COMPLEX_REGEX_03, COMPLEX_REGEX_04, COMPLEX_REGEX_05]
  deletions.reduce(bracketed) { |acc, pattern| acc.gsub(pattern) { "" } }
end
|
350
|
-
|
351
|
-
def make_reference
|
352
|
-
str.gsub
|
353
|
-
str.gsub
|
354
|
-
str.gsub
|
355
|
-
str.gsub
|
248
|
+
|
249
|
+
# Rewrites +str+ by applying four substitutions in order and returns the
# result:
#   MAKE_REFERENCE_REGEX_A -> "\n"
#   MAKE_REFERENCE_REGEX_B -> "" (deleted)
#   MAKE_REFERENCE_REGEX_C -> "[ref]"
#   MAKE_REFERENCE_REGEX_D -> "[/ref]"
def make_reference(str)
  substitutions = [
    [MAKE_REFERENCE_REGEX_A, "\n"],
    [MAKE_REFERENCE_REGEX_B, ""],
    [MAKE_REFERENCE_REGEX_C, "[ref]"],
    [MAKE_REFERENCE_REGEX_D, "[/ref]"]
  ]
  substitutions.reduce(str) do |acc, (pattern, replacement)|
    acc.gsub(pattern) { replacement }
  end
end
|
357
255
|
|
358
|
-
def correct_inline_template
|
256
|
+
def correct_inline_template(str)
|
359
257
|
scanner = StringScanner.new(str)
|
360
|
-
|
258
|
+
process_nested_structure(scanner, "{{", "}}") do |contents|
|
361
259
|
parts = contents.split("|")
|
362
260
|
if /\A(?:lang|fontsize)\z/i =~ parts[0]
|
363
261
|
parts.shift
|
364
|
-
elsif /\Alang
|
262
|
+
elsif /\Alang-/i =~ parts[0]
|
365
263
|
parts.shift
|
366
264
|
elsif /\Alang=/i =~ parts[1]
|
367
265
|
parts.shift
|
@@ -372,27 +270,25 @@ module Wp2txt
|
|
372
270
|
else
|
373
271
|
begin
|
374
272
|
keyval = parts[1].split("=")
|
375
|
-
if keyval.size > 1
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
rescue
|
273
|
+
out = if keyval.size > 1
|
274
|
+
keyval[1]
|
275
|
+
else
|
276
|
+
parts[1] || ""
|
277
|
+
end
|
278
|
+
rescue StandardError
|
381
279
|
out = parts[1] || ""
|
382
280
|
end
|
383
281
|
end
|
384
|
-
|
385
282
|
out.strip
|
386
283
|
end
|
387
|
-
str.replace result
|
388
284
|
end
|
389
285
|
|
390
|
-
#################### file related utilities ####################
|
286
|
+
#################### file related utilities ####################
|
391
287
|
|
392
288
|
# collect filenames recursively
|
393
289
|
def collect_files(str, regex = nil)
|
394
290
|
regex ||= //
|
395
|
-
text_array =
|
291
|
+
text_array = []
|
396
292
|
Find.find(str) do |f|
|
397
293
|
text_array << f if regex =~ f
|
398
294
|
end
|
@@ -400,11 +296,11 @@ module Wp2txt
|
|
400
296
|
end
|
401
297
|
|
402
298
|
# modify a file using block/yield mechanism
|
403
|
-
def file_mod(file_path, backup = false
|
299
|
+
def file_mod(file_path, backup = false)
|
404
300
|
File.open(file_path, "r") do |fr|
|
405
301
|
str = fr.read
|
406
302
|
newstr = yield(str)
|
407
|
-
str = newstr
|
303
|
+
str = newstr if nil? newstr
|
408
304
|
File.open("temp", "w") do |tf|
|
409
305
|
tf.write(str)
|
410
306
|
end
|
@@ -413,54 +309,50 @@ module Wp2txt
|
|
413
309
|
File.rename(file_path, file_path + ".bak")
|
414
310
|
File.rename("temp", file_path)
|
415
311
|
File.unlink(file_path + ".bak") unless backup
|
416
|
-
end
|
312
|
+
end
|
417
313
|
|
418
314
|
# modify files under a directory (recursive)
|
419
|
-
def batch_file_mod(dir_path
|
315
|
+
# Yields file paths to the given block.
#
# When +dir_path+ is a directory, every path returned by
# collect_files(dir_path) that is a regular file is yielded. When +dir_path+
# is itself a regular file, it is yielded once. Any other path yields
# nothing.
def batch_file_mod(dir_path)
  if FileTest.directory?(dir_path)
    collect_files(dir_path).each { |entry| yield entry if FileTest.file?(entry) }
  elsif FileTest.file?(dir_path)
    yield dir_path
  end
end
|
428
324
|
|
429
325
|
# take care of difference of separators among environments
|
430
326
|
# Normalizes path separators for the running platform.
#
# For a String: when RUBY_PLATFORM contains "win32", every "/" becomes a
# backslash; otherwise every backslash becomes "/". For an Array, returns a
# new Array with each element converted recursively. Any other input yields
# nil.
def correct_separator(input)
  case input
  when Array
    input.map { |item| correct_separator(item) }
  when String
    from, to = RUBY_PLATFORM.index("win32") ? ["/", "\\"] : ["\\", "/"]
    input.gsub(from, to)
  end
end
|
447
342
|
|
448
|
-
def rename(files, ext = "txt")
|
343
|
+
# Renames each path in +files+ so that a trailing "-<digits>" suffix is
# zero-padded to a common width, then appends ".<ext>".
#
# The padding width is the length of the longest numeric suffix found in
# +files+. It is determined over the whole list before any file is renamed,
# so every file receives the same width regardless of its position in the
# list. Paths without such a suffix only get the extension appended.
#
# files - Array of existing file paths.
# ext   - extension to append, without the leading dot (default "txt").
#
# Returns true.
def rename(files, ext = "txt")
  numbered = /-(\d+)\z/

  # First pass: number of digits needed to name the highest-numbered file.
  maxwidth = files.map { |f| f[numbered, 1].to_s.length }.max || 0

  # Second pass: pad and rename.
  files.each do |f|
    newname = f.sub(numbered) { format("-%0*d", maxwidth, Regexp.last_match(1).to_i) }
    File.rename(f, "#{newname}.#{ext}")
  end
  true
end
|
465
357
|
|
466
358
|
# convert int of seconds to string in the format 00:00:00
|
@@ -472,8 +364,6 @@ module Wp2txt
|
|
472
364
|
h = int / 3600
|
473
365
|
m = (int - h * 3600) / 60
|
474
366
|
s = int % 60
|
475
|
-
|
476
|
-
return str
|
367
|
+
format("%02d:%02d:%02d", h, m, s)
|
477
368
|
end
|
478
|
-
|
479
369
|
end
|
data/lib/wp2txt/version.rb
CHANGED