wp2txt 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +26 -3
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +159 -270
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +119 -150
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -12
- data/tags +0 -58
data/spec/utils_spec.rb
CHANGED
@@ -1,10 +1,9 @@
|
|
1
|
-
|
2
|
-
# -*- coding: utf-8 -*-
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
3
|
+
require_relative "spec_helper"
|
4
|
+
require_relative "../lib/wp2txt"
|
5
|
+
require_relative "../lib/wp2txt/article"
|
6
|
+
require_relative "../lib/wp2txt/utils"
|
8
7
|
|
9
8
|
describe "Wp2txt" do
|
10
9
|
it "contains mediawiki-format related functions:" do
|
@@ -17,14 +16,14 @@ describe "Wp2txt" do
|
|
17
16
|
|
18
17
|
describe "process_nested_structure" do
|
19
18
|
it "parse nested structure replacing str in the format specified" do
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
str_processed = process_nested_structure(
|
19
|
+
str_before1 = "[[ab[[cde[[alfa]]]]fg]]"
|
20
|
+
str_after1 = "<<ab<<cde<<alfa>>>>fg>>"
|
21
|
+
scanner1 = StringScanner.new(str_before1)
|
22
|
+
str_processed = process_nested_structure(scanner1, "[[", "]]") do |content|
|
24
23
|
"<<" + content + ">>"
|
25
24
|
end
|
26
|
-
expect(str_processed).to eq
|
27
|
-
|
25
|
+
expect(str_processed).to eq str_after1
|
26
|
+
|
28
27
|
str_before = "#* {{quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|
29
28
|
|passage={{...}} every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.}}"
|
30
29
|
str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|
@@ -33,177 +32,155 @@ describe "Wp2txt" do
|
|
33
32
|
str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
|
34
33
|
"<<" + content + ">>"
|
35
34
|
end
|
36
|
-
#str_processed.should == str_after
|
37
35
|
expect(str_processed).to eq str_after
|
38
|
-
|
39
36
|
end
|
40
37
|
end
|
41
|
-
|
42
|
-
describe "special_chr
|
38
|
+
|
39
|
+
describe "special_chr" do
|
43
40
|
it "replaces character references with real characters" do
|
44
41
|
str_before = " < > & ""
|
45
42
|
str_after = " < > & \""
|
46
|
-
special_chr
|
47
|
-
|
48
|
-
end
|
43
|
+
expect(special_chr(str_before)).to eq str_after
|
44
|
+
end
|
49
45
|
end
|
50
|
-
|
51
|
-
describe "chrref_to_utf
|
46
|
+
|
47
|
+
describe "chrref_to_utf" do
|
52
48
|
it "replaces character references with real characters" do
|
53
49
|
str_before = "♪"
|
54
50
|
str_after = "♪"
|
55
|
-
chrref_to_utf
|
56
|
-
expect(str_before).to eq str_after
|
51
|
+
expect(chrref_to_utf(str_before)).to eq str_after
|
57
52
|
end
|
58
53
|
end
|
59
|
-
|
60
|
-
describe "mndash
|
54
|
+
|
55
|
+
describe "mndash" do
|
61
56
|
it "replaces {mdash}, {ndash}, or {–} with '–'" do
|
62
57
|
str_before = "{mdash} {ndash} {–}"
|
63
58
|
str_after = "– – –"
|
64
|
-
mndash
|
65
|
-
expect(str_before).to eq str_after
|
59
|
+
expect(mndash(str_before)).to eq str_after
|
66
60
|
end
|
67
61
|
end
|
68
|
-
|
62
|
+
|
69
63
|
describe "make_reference" do
|
70
64
|
it "replaces <ref> tag with [ref]" do
|
71
65
|
str_before = "<ref> ... </ref>"
|
72
66
|
str_after = "[ref] ... [/ref]"
|
73
|
-
make_reference
|
74
|
-
|
75
|
-
end
|
67
|
+
expect(make_reference(str_before)).to eq str_after
|
68
|
+
end
|
76
69
|
end
|
77
|
-
|
78
|
-
describe "remove_table
|
70
|
+
|
71
|
+
describe "remove_table" do
|
79
72
|
it "removes table formated parts" do
|
80
73
|
str_before = "{| ... \n{| ... \n ...|}\n ...|}"
|
81
74
|
str_after = ""
|
82
|
-
remove_table
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
# describe "remove_clade" do
|
88
|
-
# it "removes clade formated parts" do
|
89
|
-
# str_before = "\{\{clade ... \n ... \n ... \n\}\}"
|
90
|
-
# str_after = ""
|
91
|
-
# expect(remove_clade(str_before)).to eq str_after
|
92
|
-
# end
|
93
|
-
# end
|
94
|
-
|
95
|
-
describe "remove_hr!" do
|
75
|
+
expect(remove_table(str_before)).to eq str_after
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
describe "remove_hr" do
|
96
80
|
it "removes horizontal lines" do
|
97
81
|
str_before = "\n----\n--\n--\n"
|
98
82
|
str_after = "\n\n"
|
99
|
-
remove_hr
|
100
|
-
|
101
|
-
end
|
83
|
+
expect(remove_hr(str_before)).to eq str_after
|
84
|
+
end
|
102
85
|
end
|
103
86
|
|
104
|
-
describe "remove_inbetween
|
87
|
+
describe "remove_inbetween" do
|
105
88
|
it "removes tags and its contents" do
|
106
|
-
|
107
|
-
|
108
|
-
remove_tag
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
remove_inbetween
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
describe "remove_directive!" do
|
89
|
+
str_before1 = "<tag>abc</tag>"
|
90
|
+
str_after1 = "abc"
|
91
|
+
expect(remove_tag(str_before1)).to eq str_after1
|
92
|
+
|
93
|
+
str_before2 = "[tag]def[/tag]"
|
94
|
+
str_after2 = "def"
|
95
|
+
expect(remove_inbetween(str_before2, ["[", "]"])).to eq str_after2
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
describe "remove_directive" do
|
118
100
|
it "removes directive" do
|
119
101
|
str_before = "__abc__\n __def__"
|
120
102
|
str_after = "\n "
|
121
|
-
remove_directive
|
122
|
-
|
123
|
-
end
|
103
|
+
expect(remove_directive(str_before)).to eq str_after
|
104
|
+
end
|
124
105
|
end
|
125
106
|
|
126
|
-
describe "remove_emphasis
|
107
|
+
describe "remove_emphasis" do
|
127
108
|
it "removes directive" do
|
128
109
|
str_before = "''abc''\n'''def'''"
|
129
110
|
str_after = "abc\ndef"
|
130
|
-
remove_emphasis
|
131
|
-
|
132
|
-
end
|
111
|
+
expect(remove_emphasis(str_before)).to eq str_after
|
112
|
+
end
|
133
113
|
end
|
134
|
-
|
135
|
-
describe "escape_nowiki
|
114
|
+
|
115
|
+
describe "escape_nowiki" do
|
136
116
|
it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
|
137
117
|
str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
|
138
118
|
str_after = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
|
139
|
-
escape_nowiki
|
140
|
-
expect(str_before).to match str_after
|
119
|
+
expect(escape_nowiki(str_before)).to match str_after
|
141
120
|
end
|
142
121
|
end
|
143
122
|
|
144
|
-
describe "unescape_nowiki
|
123
|
+
describe "unescape_nowiki" do
|
145
124
|
it "replaces <nowiki-object_id> with string stored elsewhere" do
|
146
|
-
@nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
|
125
|
+
@nowikis = { 123 => "[[abc]]", 124 => "[[ghi]]" }
|
147
126
|
str_before = "<nowiki-123>def<nowiki-124>"
|
148
127
|
str_after = "[[abc]]def[[ghi]]"
|
149
|
-
unescape_nowiki
|
150
|
-
expect(str_before).to eq str_after
|
128
|
+
expect(unescape_nowiki(str_before)).to eq str_after
|
151
129
|
end
|
152
130
|
end
|
153
|
-
|
154
|
-
describe "process_interwiki_links
|
131
|
+
|
132
|
+
describe "process_interwiki_links" do
|
155
133
|
it "formats text link and remove brackets" do
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
process_interwiki_links
|
161
|
-
process_interwiki_links
|
162
|
-
process_interwiki_links
|
163
|
-
process_interwiki_links
|
164
|
-
expect(
|
165
|
-
expect(
|
166
|
-
expect(
|
167
|
-
expect(
|
134
|
+
a1 = "[[a b]]"
|
135
|
+
b1 = "[[a b|c]]"
|
136
|
+
c1 = "[[a|b|c]]"
|
137
|
+
d1 = "[[硬口蓋鼻音|[ɲ], /J/]]"
|
138
|
+
a2 = process_interwiki_links(a1)
|
139
|
+
b2 = process_interwiki_links(b1)
|
140
|
+
c2 = process_interwiki_links(c1)
|
141
|
+
d2 = process_interwiki_links(d1)
|
142
|
+
expect(a2).to eq "a b"
|
143
|
+
expect(b2).to eq "c"
|
144
|
+
expect(c2).to eq "b|c"
|
145
|
+
expect(d2).to eq "[ɲ], /J/"
|
168
146
|
end
|
169
147
|
end
|
170
148
|
|
171
|
-
describe "process_external_links
|
149
|
+
describe "process_external_links" do
|
172
150
|
it "formats text link and remove brackets" do
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
process_external_links
|
177
|
-
process_external_links
|
178
|
-
process_external_links
|
179
|
-
expect(
|
180
|
-
expect(
|
181
|
-
expect(
|
151
|
+
a1 = "[http://yohasebe.com yohasebe.com]"
|
152
|
+
b1 = "[http://yohasebe.com]"
|
153
|
+
c1 = "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
|
154
|
+
a2 = process_external_links(a1)
|
155
|
+
b2 = process_external_links(b1)
|
156
|
+
c2 = process_external_links(c1)
|
157
|
+
expect(a2).to eq "yohasebe.com"
|
158
|
+
expect(b2).to eq "http://yohasebe.com"
|
159
|
+
expect(c2).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
|
182
160
|
end
|
183
161
|
end
|
184
|
-
|
185
|
-
describe "correct_inline_template
|
162
|
+
|
163
|
+
describe "correct_inline_template" do
|
186
164
|
it "removes brackets and leaving some text" do
|
187
|
-
|
188
|
-
|
189
|
-
correct_inline_template
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
correct_inline_template
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
correct_inline_template
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
correct_inline_template
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
correct_inline_template
|
206
|
-
expect(str_before).to eq str_after
|
165
|
+
str_before1 = "{{MedalCountry | {{JPN}} }}"
|
166
|
+
str_after1 = "JPN"
|
167
|
+
expect(correct_inline_template(str_before1)).to eq str_after1
|
168
|
+
|
169
|
+
str_before2 = "{{lang|en|Japan}}"
|
170
|
+
str_after2 = "Japan"
|
171
|
+
expect(correct_inline_template(str_before2)).to eq str_after2
|
172
|
+
|
173
|
+
str_before3 = "{{a|b=c|d=f}}"
|
174
|
+
str_after3 = "c"
|
175
|
+
expect(correct_inline_template(str_before3)).to eq str_after3
|
176
|
+
|
177
|
+
str_before4 = "{{a|b|{{c|d|e}}}}"
|
178
|
+
str_after4 = "b"
|
179
|
+
expect(correct_inline_template(str_before4)).to eq str_after4
|
180
|
+
|
181
|
+
str_before5 = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
|
182
|
+
str_after5 = "日本人に多く見受けられる"
|
183
|
+
expect(correct_inline_template(str_before5)).to eq str_after5
|
207
184
|
end
|
208
185
|
end
|
209
186
|
end
|
data/wp2txt.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/wp2txt/version"
|
4
4
|
|
5
5
|
Gem::Specification.new do |s|
|
6
6
|
s.name = "wp2txt"
|
@@ -8,26 +8,24 @@ Gem::Specification.new do |s|
|
|
8
8
|
s.authors = ["Yoichiro Hasebe"]
|
9
9
|
s.email = ["yohasebe@gmail.com"]
|
10
10
|
s.homepage = "https://github.com/yohasebe/wp2txt"
|
11
|
-
s.summary =
|
12
|
-
s.description =
|
13
|
-
|
11
|
+
s.summary = "A command-line toolkit to extract text content and category data from Wikipedia dump files"
|
12
|
+
s.description = "WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata."
|
14
13
|
s.rubyforge_project = "wp2txt"
|
15
|
-
|
16
|
-
s.
|
14
|
+
s.license = "MIT"
|
15
|
+
s.required_ruby_version = Gem::Requirement.new(">= 2.6")
|
16
|
+
s.files = `git ls-files`.split("\n")
|
17
17
|
s.files -= ["data/*", "image/*"]
|
18
|
-
s.test_files
|
19
|
-
s.executables
|
18
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
19
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
|
20
20
|
s.require_paths = ["lib"]
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
# s.add_development_dependency "rake"
|
25
|
-
|
26
|
-
s.add_dependency "nokogiri"
|
27
|
-
s.add_dependency "ruby-progressbar"
|
28
|
-
s.add_dependency "parallel"
|
21
|
+
s.add_development_dependency "bundler"
|
22
|
+
s.add_development_dependency "rake"
|
23
|
+
s.add_development_dependency "rspec"
|
29
24
|
s.add_dependency "htmlentities"
|
25
|
+
s.add_dependency "nokogiri"
|
30
26
|
s.add_dependency "optimist"
|
27
|
+
s.add_dependency "parallel"
|
31
28
|
s.add_dependency "pastel"
|
29
|
+
s.add_dependency "ruby-progressbar"
|
32
30
|
s.add_dependency "tty-spinner"
|
33
31
|
end
|
metadata
CHANGED
@@ -1,23 +1,23 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
|
-
type: :
|
20
|
+
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
@@ -25,13 +25,13 @@ dependencies:
|
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '0'
|
34
|
-
type: :
|
34
|
+
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
@@ -39,13 +39,13 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '0'
|
48
|
-
type: :
|
48
|
+
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: nokogiri
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: optimist
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -80,6 +94,20 @@ dependencies:
|
|
80
94
|
- - ">="
|
81
95
|
- !ruby/object:Gem::Version
|
82
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: parallel
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
83
111
|
- !ruby/object:Gem::Dependency
|
84
112
|
name: pastel
|
85
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -94,6 +122,20 @@ dependencies:
|
|
94
122
|
- - ">="
|
95
123
|
- !ruby/object:Gem::Version
|
96
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: ruby-progressbar
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
97
139
|
- !ruby/object:Gem::Dependency
|
98
140
|
name: tty-spinner
|
99
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -117,7 +159,12 @@ executables:
|
|
117
159
|
extensions: []
|
118
160
|
extra_rdoc_files: []
|
119
161
|
files:
|
162
|
+
- ".dockerignore"
|
163
|
+
- ".github/workflows/ci.yml"
|
120
164
|
- ".gitignore"
|
165
|
+
- ".rubocop.yml"
|
166
|
+
- ".solargraph.yml"
|
167
|
+
- Dockerfile
|
121
168
|
- Gemfile
|
122
169
|
- LICENSE
|
123
170
|
- README.md
|
@@ -136,14 +183,15 @@ files:
|
|
136
183
|
- image/wp2txt.svg
|
137
184
|
- lib/wp2txt.rb
|
138
185
|
- lib/wp2txt/article.rb
|
186
|
+
- lib/wp2txt/regex.rb
|
139
187
|
- lib/wp2txt/utils.rb
|
140
188
|
- lib/wp2txt/version.rb
|
141
189
|
- spec/spec_helper.rb
|
142
190
|
- spec/utils_spec.rb
|
143
|
-
- tags
|
144
191
|
- wp2txt.gemspec
|
145
192
|
homepage: https://github.com/yohasebe/wp2txt
|
146
|
-
licenses:
|
193
|
+
licenses:
|
194
|
+
- MIT
|
147
195
|
metadata: {}
|
148
196
|
post_install_message:
|
149
197
|
rdoc_options: []
|
@@ -153,14 +201,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
153
201
|
requirements:
|
154
202
|
- - ">="
|
155
203
|
- !ruby/object:Gem::Version
|
156
|
-
version: '
|
204
|
+
version: '2.6'
|
157
205
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
158
206
|
requirements:
|
159
207
|
- - ">="
|
160
208
|
- !ruby/object:Gem::Version
|
161
209
|
version: '0'
|
162
210
|
requirements: []
|
163
|
-
rubygems_version: 3.
|
211
|
+
rubygems_version: 3.4.1
|
164
212
|
signing_key:
|
165
213
|
specification_version: 4
|
166
214
|
summary: A command-line toolkit to extract text content and category data from Wikipedia
|
data/tags
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
!_TAG_FILE_FORMAT 2 /extended format; --format=1 will not append ;" to lines/
|
2
|
-
!_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted, 2=foldcase/
|
3
|
-
!_TAG_PROGRAM_AUTHOR Darren Hiebert /dhiebert@users.sourceforge.net/
|
4
|
-
!_TAG_PROGRAM_NAME Exuberant Ctags //
|
5
|
-
!_TAG_PROGRAM_URL http://ctags.sourceforge.net /official site/
|
6
|
-
!_TAG_PROGRAM_VERSION 5.8 //
|
7
|
-
Article lib/wp2txt/article.rb /^ class Article$/;" c class:Wp2txt
|
8
|
-
Runner lib/wp2txt.rb /^ class Runner$/;" c class:Wp2txt.Splitter.file_size
|
9
|
-
Splitter lib/wp2txt.rb /^ class Splitter$/;" c class:Wp2txt
|
10
|
-
Wp2txt lib/wp2txt.rb /^module Wp2txt$/;" m
|
11
|
-
Wp2txt lib/wp2txt/article.rb /^module Wp2txt$/;" m
|
12
|
-
Wp2txt lib/wp2txt/utils.rb /^module Wp2txt$/;" m
|
13
|
-
Wp2txt lib/wp2txt/version.rb /^module Wp2txt$/;" m
|
14
|
-
batch_file_mod lib/wp2txt/utils.rb /^ def batch_file_mod(dir_path, &block)$/;" f
|
15
|
-
chrref_to_utf! lib/wp2txt/utils.rb /^ def chrref_to_utf!(num_str)$/;" f
|
16
|
-
cleanup! lib/wp2txt/utils.rb /^ def cleanup!(text)$/;" f
|
17
|
-
collect_files lib/wp2txt/utils.rb /^ def collect_files(str, regex = nil)$/;" f
|
18
|
-
command_exist? lib/wp2txt.rb /^ def command_exist?(command)$/;" f class:Wp2txt.Splitter.file_size
|
19
|
-
convert_characters! lib/wp2txt/utils.rb /^ def convert_characters!(text, has_retried = false)$/;" f class:Wp2txt
|
20
|
-
correct_inline_template! lib/wp2txt/utils.rb /^ def correct_inline_template!(str)$/;" f
|
21
|
-
correct_separator lib/wp2txt/utils.rb /^ def correct_separator(input)$/;" f
|
22
|
-
create_element lib/wp2txt/article.rb /^ def create_element(tp, text)$/;" f class:Wp2txt.Article
|
23
|
-
escape_nowiki! lib/wp2txt/utils.rb /^ def escape_nowiki!(str)$/;" f
|
24
|
-
extract_text lib/wp2txt.rb /^ def extract_text(&block)$/;" f class:Wp2txt.Splitter.file_size.Runner.fill_buffer
|
25
|
-
file_mod lib/wp2txt/utils.rb /^ def file_mod(file_path, backup = false, &block)$/;" f
|
26
|
-
file_size lib/wp2txt.rb /^ def file_size(file)$/;" f class:Wp2txt.Splitter
|
27
|
-
fill_buffer lib/wp2txt.rb /^ def fill_buffer$/;" f class:Wp2txt.Splitter.file_size
|
28
|
-
fill_buffer lib/wp2txt.rb /^ def fill_buffer$/;" f class:Wp2txt.Splitter.file_size.Runner
|
29
|
-
format_wiki! lib/wp2txt/utils.rb /^ def format_wiki!(text, has_retried = false)$/;" f
|
30
|
-
get_newline lib/wp2txt.rb /^ def get_newline$/;" f class:Wp2txt.Splitter.file_size.Runner.fill_buffer
|
31
|
-
get_newline lib/wp2txt.rb /^ def get_newline$/;" f class:Wp2txt.Splitter.file_size.fill_buffer
|
32
|
-
get_page lib/wp2txt.rb /^ def get_page$/;" f class:Wp2txt.Splitter.file_size.Runner.fill_buffer
|
33
|
-
initialize lib/wp2txt.rb /^ def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)$/;" f class:Wp2txt.Splitter.file_size.Runner
|
34
|
-
initialize lib/wp2txt.rb /^ def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)$/;" f class:Wp2txt.Splitter
|
35
|
-
initialize lib/wp2txt/article.rb /^ def initialize(text, title = "", strip_tmarker = false)$/;" f class:Wp2txt.Article
|
36
|
-
make_reference! lib/wp2txt/utils.rb /^ def make_reference!(str)$/;" f
|
37
|
-
mndash! lib/wp2txt/utils.rb /^ def mndash!(str)$/;" f
|
38
|
-
parse lib/wp2txt/article.rb /^ def parse(source)$/;" f class:Wp2txt.Article
|
39
|
-
prepare lib/wp2txt.rb /^ def prepare$/;" f class:Wp2txt.Splitter.file_size
|
40
|
-
prepare lib/wp2txt.rb /^ def prepare$/;" f class:Wp2txt.Splitter.file_size.Runner
|
41
|
-
process_external_links! lib/wp2txt/utils.rb /^ def process_external_links!(str)$/;" f
|
42
|
-
process_interwiki_links! lib/wp2txt/utils.rb /^ def process_interwiki_links!(str)$/;" f
|
43
|
-
process_nested_structure lib/wp2txt/utils.rb /^ def process_nested_structure(scanner, left, right, &block)$/;" f
|
44
|
-
remove_complex! lib/wp2txt/utils.rb /^ def remove_complex!(str)$/;" f
|
45
|
-
remove_directive! lib/wp2txt/utils.rb /^ def remove_directive!(str)$/;" f
|
46
|
-
remove_emphasis! lib/wp2txt/utils.rb /^ def remove_emphasis!(str)$/;" f
|
47
|
-
remove_hr! lib/wp2txt/utils.rb /^ def remove_hr!(str)$/;" f
|
48
|
-
remove_html! lib/wp2txt/utils.rb /^ def remove_html!(str)$/;" f
|
49
|
-
remove_inbetween! lib/wp2txt/utils.rb /^ def remove_inbetween!(str, tagset = ['<', '>'])$/;" f
|
50
|
-
remove_ref! lib/wp2txt/utils.rb /^ def remove_ref!(str)$/;" f
|
51
|
-
remove_table! lib/wp2txt/utils.rb /^ def remove_table!(str)$/;" f
|
52
|
-
remove_tag! lib/wp2txt/utils.rb /^ def remove_tag!(str)$/;" f
|
53
|
-
remove_templates! lib/wp2txt/utils.rb /^ def remove_templates!(str)$/;" f
|
54
|
-
rename lib/wp2txt/utils.rb /^ def rename(files, ext = "txt")$/;" f
|
55
|
-
sec_to_str lib/wp2txt/utils.rb /^ def sec_to_str(int)$/;" f
|
56
|
-
special_chr! lib/wp2txt/utils.rb /^ def special_chr!(str)$/;" f
|
57
|
-
split_file lib/wp2txt.rb /^ def split_file$/;" f class:Wp2txt.Splitter.file_size.fill_buffer
|
58
|
-
unescape_nowiki! lib/wp2txt/utils.rb /^ def unescape_nowiki!(str)$/;" f
|