wp2txt 0.7.5 → 0.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wp2txt +4 -3
- data/lib/wp2txt/utils.rb +1 -40
- data/lib/wp2txt/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b4dff862e52128851fb4db35a59a1f4e3e9e473f
|
4
|
+
data.tar.gz: e38612ada2785a2fed1e975460451b0d0a703e5e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 19159a3706d0dfb687c239f3200ac3685916e63a4e3ec292d3e47acf4e8787c5ed728837bdf4b5586691adacdc75decddc1b2c7937ad483ca15a3663213d32c9
|
7
|
+
data.tar.gz: 8d956611e29eb4fa0d058ab6459bd96167352fdb5ee44b1819fe4688983a8ebf146cc2413e609029558d845bf9e93f9e27eaca9a35f34bf4644c234a4809146c
|
data/bin/wp2txt
CHANGED
@@ -127,8 +127,9 @@ wpconv.extract_text do |article|
|
|
127
127
|
if /\A\s*\z/m =~ contents
|
128
128
|
result = ""
|
129
129
|
else
|
130
|
-
result = config[:title] ? title + "\n"
|
130
|
+
result = config[:title] ? title + "\n" << contents : contents
|
131
131
|
end
|
132
|
-
result
|
133
|
-
result
|
132
|
+
result.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
|
133
|
+
result.gsub!(/\n\n\n+/m){"\n\n"}
|
134
|
+
result << "\n"
|
134
135
|
end
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -68,8 +68,6 @@ $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
|
|
68
68
|
$pre_marks_regex = Regexp.new('\A\^\ ')
|
69
69
|
$def_marks_regex = Regexp.new('\A[\;\:\ ]+')
|
70
70
|
$onset_bar_regex = Regexp.new('\A[^\|]+\z')
|
71
|
-
# $remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
|
72
|
-
# $remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
|
73
71
|
|
74
72
|
$category_patterns = ["Category", "Categoria"].join("|")
|
75
73
|
$category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
@@ -340,44 +338,7 @@ module Wp2txt
|
|
340
338
|
end
|
341
339
|
end
|
342
340
|
|
343
|
-
|
344
|
-
|
345
|
-
# def process_template(str)
|
346
|
-
# scanner = StringScanner.new(str)
|
347
|
-
# result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
|
348
|
-
# parts = contents.split("|")
|
349
|
-
# case parts.size
|
350
|
-
# when 0
|
351
|
-
# ""
|
352
|
-
# when 1
|
353
|
-
# parts.first || ""
|
354
|
-
# else
|
355
|
-
# if parts.last.split("=").size > 1
|
356
|
-
# parts.first || ""
|
357
|
-
# else
|
358
|
-
# parts.last || ""
|
359
|
-
# end
|
360
|
-
# end
|
361
|
-
# end
|
362
|
-
# result
|
363
|
-
# end
|
364
|
-
|
365
|
-
# def remove_table(str)
|
366
|
-
# new_str = str.gsub($remove_table_regex, "")
|
367
|
-
# if str != new_str
|
368
|
-
# new_str = remove_table(new_str)
|
369
|
-
# end
|
370
|
-
# new_str = remove_table(new_str) unless str == new_str
|
371
|
-
# return new_str
|
372
|
-
# end
|
373
|
-
|
374
|
-
# def remove_clade(page)
|
375
|
-
# new_page = page.gsub($remove_clade_regex, "")
|
376
|
-
# new_page = remove_clade(new_page) unless page == new_page
|
377
|
-
# new_page
|
378
|
-
# end
|
379
|
-
|
380
|
-
#################### file related utilities ####################
|
341
|
+
#################### file related utilities ####################
|
381
342
|
|
382
343
|
# collect filenames recursively
|
383
344
|
def collect_files(str, regex = nil)
|
data/lib/wp2txt/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.5
|
4
|
+
version: 0.7.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
@@ -99,7 +99,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
99
99
|
version: '0'
|
100
100
|
requirements: []
|
101
101
|
rubyforge_project: wp2txt
|
102
|
-
rubygems_version: 2.4.
|
102
|
+
rubygems_version: 2.4.3
|
103
103
|
signing_key:
|
104
104
|
specification_version: 4
|
105
105
|
summary: Wikipedia dump to text converter
|