wp2txt 0.7.5 → 0.7.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/wp2txt +4 -3
- data/lib/wp2txt/utils.rb +1 -40
- data/lib/wp2txt/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b4dff862e52128851fb4db35a59a1f4e3e9e473f
|
4
|
+
data.tar.gz: e38612ada2785a2fed1e975460451b0d0a703e5e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 19159a3706d0dfb687c239f3200ac3685916e63a4e3ec292d3e47acf4e8787c5ed728837bdf4b5586691adacdc75decddc1b2c7937ad483ca15a3663213d32c9
|
7
|
+
data.tar.gz: 8d956611e29eb4fa0d058ab6459bd96167352fdb5ee44b1819fe4688983a8ebf146cc2413e609029558d845bf9e93f9e27eaca9a35f34bf4644c234a4809146c
|
data/bin/wp2txt
CHANGED
@@ -127,8 +127,9 @@ wpconv.extract_text do |article|
|
|
127
127
|
if /\A\s*\z/m =~ contents
|
128
128
|
result = ""
|
129
129
|
else
|
130
|
-
result = config[:title] ? title + "\n"
|
130
|
+
result = config[:title] ? title + "\n" << contents : contents
|
131
131
|
end
|
132
|
-
result
|
133
|
-
result
|
132
|
+
result.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
|
133
|
+
result.gsub!(/\n\n\n+/m){"\n\n"}
|
134
|
+
result << "\n"
|
134
135
|
end
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -68,8 +68,6 @@ $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
|
|
68
68
|
$pre_marks_regex = Regexp.new('\A\^\ ')
|
69
69
|
$def_marks_regex = Regexp.new('\A[\;\:\ ]+')
|
70
70
|
$onset_bar_regex = Regexp.new('\A[^\|]+\z')
|
71
|
-
# $remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
|
72
|
-
# $remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
|
73
71
|
|
74
72
|
$category_patterns = ["Category", "Categoria"].join("|")
|
75
73
|
$category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
@@ -340,44 +338,7 @@ module Wp2txt
|
|
340
338
|
end
|
341
339
|
end
|
342
340
|
|
343
|
-
|
344
|
-
|
345
|
-
# def process_template(str)
|
346
|
-
# scanner = StringScanner.new(str)
|
347
|
-
# result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
|
348
|
-
# parts = contents.split("|")
|
349
|
-
# case parts.size
|
350
|
-
# when 0
|
351
|
-
# ""
|
352
|
-
# when 1
|
353
|
-
# parts.first || ""
|
354
|
-
# else
|
355
|
-
# if parts.last.split("=").size > 1
|
356
|
-
# parts.first || ""
|
357
|
-
# else
|
358
|
-
# parts.last || ""
|
359
|
-
# end
|
360
|
-
# end
|
361
|
-
# end
|
362
|
-
# result
|
363
|
-
# end
|
364
|
-
|
365
|
-
# def remove_table(str)
|
366
|
-
# new_str = str.gsub($remove_table_regex, "")
|
367
|
-
# if str != new_str
|
368
|
-
# new_str = remove_table(new_str)
|
369
|
-
# end
|
370
|
-
# new_str = remove_table(new_str) unless str == new_str
|
371
|
-
# return new_str
|
372
|
-
# end
|
373
|
-
|
374
|
-
# def remove_clade(page)
|
375
|
-
# new_page = page.gsub($remove_clade_regex, "")
|
376
|
-
# new_page = remove_clade(new_page) unless page == new_page
|
377
|
-
# new_page
|
378
|
-
# end
|
379
|
-
|
380
|
-
#################### file related utilities ####################
|
341
|
+
#################### file related utilities ####################
|
381
342
|
|
382
343
|
# collect filenames recursively
|
383
344
|
def collect_files(str, regex = nil)
|
data/lib/wp2txt/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.5
|
4
|
+
version: 0.7.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
@@ -99,7 +99,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
99
99
|
version: '0'
|
100
100
|
requirements: []
|
101
101
|
rubyforge_project: wp2txt
|
102
|
-
rubygems_version: 2.4.
|
102
|
+
rubygems_version: 2.4.3
|
103
103
|
signing_key:
|
104
104
|
specification_version: 4
|
105
105
|
summary: Wikipedia dump to text converter
|