wp2txt 1.0.1 → 1.1.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
data/spec/utils_spec.rb CHANGED
@@ -1,10 +1,9 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
1
+ # frozen_string_literal: true
3
2
 
4
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
5
- require 'wp2txt'
6
- require 'wp2txt/article'
7
- require 'wp2txt/utils'
3
+ require_relative "spec_helper"
4
+ require_relative "../lib/wp2txt"
5
+ require_relative "../lib/wp2txt/article"
6
+ require_relative "../lib/wp2txt/utils"
8
7
 
9
8
  describe "Wp2txt" do
10
9
  it "contains mediawiki-format related functions:" do
@@ -17,14 +16,14 @@ describe "Wp2txt" do
17
16
 
18
17
  describe "process_nested_structure" do
19
18
  it "parse nested structure replacing str in the format specified" do
20
- str_before = "[[ab[[cde[[alfa]]]]fg]]"
21
- str_after = "<<ab<<cde<<alfa>>>>fg>>"
22
- scanner = StringScanner.new(str_before)
23
- str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
19
+ str_before1 = "[[ab[[cde[[alfa]]]]fg]]"
20
+ str_after1 = "<<ab<<cde<<alfa>>>>fg>>"
21
+ scanner1 = StringScanner.new(str_before1)
22
+ str_processed = process_nested_structure(scanner1, "[[", "]]") do |content|
24
23
  "<<" + content + ">>"
25
24
  end
26
- expect(str_processed).to eq str_after
27
-
25
+ expect(str_processed).to eq str_after1
26
+
28
27
  str_before = "#* {{quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
29
28
  |passage={{...}} every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.}}"
30
29
  str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
@@ -33,177 +32,155 @@ describe "Wp2txt" do
33
32
  str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
34
33
  "<<" + content + ">>"
35
34
  end
36
- #str_processed.should == str_after
37
35
  expect(str_processed).to eq str_after
38
-
39
36
  end
40
37
  end
41
-
42
- describe "special_chr!" do
38
+
39
+ describe "special_chr" do
43
40
  it "replaces character references with real characters" do
44
41
  str_before = "&nbsp; &lt; &gt; &amp; &quot;"
45
42
  str_after = "  < > & \""
46
- special_chr!(str_before)
47
- expect(str_before).to eq str_after
48
- end
43
+ expect(special_chr(str_before)).to eq str_after
44
+ end
49
45
  end
50
-
51
- describe "chrref_to_utf!" do
46
+
47
+ describe "chrref_to_utf" do
52
48
  it "replaces character references with real characters" do
53
49
  str_before = "&#x266A;"
54
50
  str_after = "♪"
55
- chrref_to_utf!(str_before)
56
- expect(str_before).to eq str_after
51
+ expect(chrref_to_utf(str_before)).to eq str_after
57
52
  end
58
53
  end
59
-
60
- describe "mndash!" do
54
+
55
+ describe "mndash" do
61
56
  it "replaces {mdash}, {ndash}, or {–} with '–'" do
62
57
  str_before = "{mdash} {ndash} {–}"
63
58
  str_after = "– – –"
64
- mndash!(str_before)
65
- expect(str_before).to eq str_after
59
+ expect(mndash(str_before)).to eq str_after
66
60
  end
67
61
  end
68
-
62
+
69
63
  describe "make_reference" do
70
64
  it "replaces <ref> tag with [ref]" do
71
65
  str_before = "<ref> ... </ref>"
72
66
  str_after = "[ref] ... [/ref]"
73
- make_reference!(str_before)
74
- expect(str_before).to eq str_after
75
- end
67
+ expect(make_reference(str_before)).to eq str_after
68
+ end
76
69
  end
77
-
78
- describe "remove_table!" do
70
+
71
+ describe "remove_table" do
79
72
  it "removes table formated parts" do
80
73
  str_before = "{| ... \n{| ... \n ...|}\n ...|}"
81
74
  str_after = ""
82
- remove_table!(str_before)
83
- expect(str_before).to eq str_after
84
- end
85
- end
86
-
87
- # describe "remove_clade" do
88
- # it "removes clade formated parts" do
89
- # str_before = "\{\{clade ... \n ... \n ... \n\}\}"
90
- # str_after = ""
91
- # expect(remove_clade(str_before)).to eq str_after
92
- # end
93
- # end
94
-
95
- describe "remove_hr!" do
75
+ expect(remove_table(str_before)).to eq str_after
76
+ end
77
+ end
78
+
79
+ describe "remove_hr" do
96
80
  it "removes horizontal lines" do
97
81
  str_before = "\n----\n--\n--\n"
98
82
  str_after = "\n\n"
99
- remove_hr!(str_before)
100
- expect(str_before).to eq str_after
101
- end
83
+ expect(remove_hr(str_before)).to eq str_after
84
+ end
102
85
  end
103
86
 
104
- describe "remove_inbetween!" do
87
+ describe "remove_inbetween" do
105
88
  it "removes tags and its contents" do
106
- str_before = "<tag>abc</tag>"
107
- str_after = "abc"
108
- remove_tag!(str_before)
109
- expect(str_before).to eq str_after
110
- str_before = "[tag]def[/tag]"
111
- str_after = "def"
112
- remove_inbetween!(str_before, ['[', ']'])
113
- expect(str_before).to eq str_after
114
- end
115
- end
116
-
117
- describe "remove_directive!" do
89
+ str_before1 = "<tag>abc</tag>"
90
+ str_after1 = "abc"
91
+ expect(remove_tag(str_before1)).to eq str_after1
92
+
93
+ str_before2 = "[tag]def[/tag]"
94
+ str_after2 = "def"
95
+ expect(remove_inbetween(str_before2, ["[", "]"])).to eq str_after2
96
+ end
97
+ end
98
+
99
+ describe "remove_directive" do
118
100
  it "removes directive" do
119
101
  str_before = "__abc__\n __def__"
120
102
  str_after = "\n "
121
- remove_directive!(str_before)
122
- expect(str_before).to eq str_after
123
- end
103
+ expect(remove_directive(str_before)).to eq str_after
104
+ end
124
105
  end
125
106
 
126
- describe "remove_emphasis!" do
107
+ describe "remove_emphasis" do
127
108
  it "removes directive" do
128
109
  str_before = "''abc''\n'''def'''"
129
110
  str_after = "abc\ndef"
130
- remove_emphasis!(str_before)
131
- expect(str_before).to eq str_after
132
- end
111
+ expect(remove_emphasis(str_before)).to eq str_after
112
+ end
133
113
  end
134
-
135
- describe "escape_nowiki!" do
114
+
115
+ describe "escape_nowiki" do
136
116
  it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
137
117
  str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
138
118
  str_after = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
139
- escape_nowiki!(str_before)
140
- expect(str_before).to match str_after
119
+ expect(escape_nowiki(str_before)).to match str_after
141
120
  end
142
121
  end
143
122
 
144
- describe "unescape_nowiki!" do
123
+ describe "unescape_nowiki" do
145
124
  it "replaces <nowiki-object_id> with string stored elsewhere" do
146
- @nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
125
+ @nowikis = { 123 => "[[abc]]", 124 => "[[ghi]]" }
147
126
  str_before = "<nowiki-123>def<nowiki-124>"
148
127
  str_after = "[[abc]]def[[ghi]]"
149
- unescape_nowiki!(str_before)
150
- expect(str_before).to eq str_after
128
+ expect(unescape_nowiki(str_before)).to eq str_after
151
129
  end
152
130
  end
153
-
154
- describe "process_interwiki_links!" do
131
+
132
+ describe "process_interwiki_links" do
155
133
  it "formats text link and remove brackets" do
156
- a = "[[a b]]"
157
- b = "[[a b|c]]"
158
- c = "[[a|b|c]]"
159
- d = "[[硬口蓋鼻音|[ɲ], /J/]]"
160
- process_interwiki_links!(a)
161
- process_interwiki_links!(b)
162
- process_interwiki_links!(c)
163
- process_interwiki_links!(d)
164
- expect(a).to eq "a b"
165
- expect(b).to eq "c"
166
- expect(c).to eq "b|c"
167
- expect(d).to eq "[ɲ], /J/"
134
+ a1 = "[[a b]]"
135
+ b1 = "[[a b|c]]"
136
+ c1 = "[[a|b|c]]"
137
+ d1 = "[[硬口蓋鼻音|[ɲ], /J/]]"
138
+ a2 = process_interwiki_links(a1)
139
+ b2 = process_interwiki_links(b1)
140
+ c2 = process_interwiki_links(c1)
141
+ d2 = process_interwiki_links(d1)
142
+ expect(a2).to eq "a b"
143
+ expect(b2).to eq "c"
144
+ expect(c2).to eq "b|c"
145
+ expect(d2).to eq "[ɲ], /J/"
168
146
  end
169
147
  end
170
148
 
171
- describe "process_external_links!" do
149
+ describe "process_external_links" do
172
150
  it "formats text link and remove brackets" do
173
- a = "[http://yohasebe.com yohasebe.com]"
174
- b = "[http://yohasebe.com]"
175
- c = "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
176
- process_external_links!(a)
177
- process_external_links!(b)
178
- process_external_links!(c)
179
- expect(a).to eq "yohasebe.com"
180
- expect(b).to eq "http://yohasebe.com"
181
- expect(c).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
151
+ a1 = "[http://yohasebe.com yohasebe.com]"
152
+ b1 = "[http://yohasebe.com]"
153
+ c1 = "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
154
+ a2 = process_external_links(a1)
155
+ b2 = process_external_links(b1)
156
+ c2 = process_external_links(c1)
157
+ expect(a2).to eq "yohasebe.com"
158
+ expect(b2).to eq "http://yohasebe.com"
159
+ expect(c2).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
182
160
  end
183
161
  end
184
-
185
- describe "correct_inline_template!" do
162
+
163
+ describe "correct_inline_template" do
186
164
  it "removes brackets and leaving some text" do
187
- str_before = "{{MedalCountry | {{JPN}} }}"
188
- str_after = "JPN"
189
- correct_inline_template!(str_before)
190
- expect(str_before).to eq str_after
191
- str_before = "{{lang|en|Japan}}"
192
- str_after = "Japan"
193
- correct_inline_template!(str_before)
194
- expect(str_before).to eq str_after
195
- str_before = "{{a|b=c|d=f}}"
196
- str_after = "c"
197
- correct_inline_template!(str_before)
198
- expect(str_before).to eq str_after
199
- str_before = "{{a|b|{{c|d|e}}}}"
200
- str_after = "b"
201
- correct_inline_template!(str_before)
202
- expect(str_before).to eq str_after
203
- str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
204
- str_after = "日本人に多く見受けられる"
205
- correct_inline_template!(str_before)
206
- expect(str_before).to eq str_after
165
+ str_before1 = "{{MedalCountry | {{JPN}} }}"
166
+ str_after1 = "JPN"
167
+ expect(correct_inline_template(str_before1)).to eq str_after1
168
+
169
+ str_before2 = "{{lang|en|Japan}}"
170
+ str_after2 = "Japan"
171
+ expect(correct_inline_template(str_before2)).to eq str_after2
172
+
173
+ str_before3 = "{{a|b=c|d=f}}"
174
+ str_after3 = "c"
175
+ expect(correct_inline_template(str_before3)).to eq str_after3
176
+
177
+ str_before4 = "{{a|b|{{c|d|e}}}}"
178
+ str_after4 = "b"
179
+ expect(correct_inline_template(str_before4)).to eq str_after4
180
+
181
+ str_before5 = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
182
+ str_after5 = "日本人に多く見受けられる"
183
+ expect(correct_inline_template(str_before5)).to eq str_after5
207
184
  end
208
185
  end
209
186
  end
data/wp2txt.gemspec CHANGED
@@ -1,6 +1,6 @@
1
- # -*- coding: utf-8 -*-
2
- $:.push File.expand_path("../lib", __FILE__)
3
- require "wp2txt/version"
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/wp2txt/version"
4
4
 
5
5
  Gem::Specification.new do |s|
6
6
  s.name = "wp2txt"
@@ -8,26 +8,24 @@ Gem::Specification.new do |s|
8
8
  s.authors = ["Yoichiro Hasebe"]
9
9
  s.email = ["yohasebe@gmail.com"]
10
10
  s.homepage = "https://github.com/yohasebe/wp2txt"
11
- s.summary = %q{A command-line toolkit to extract text content and category data from Wikipedia dump files}
12
- s.description = %q{WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.}
13
-
11
+ s.summary = "A command-line toolkit to extract text content and category data from Wikipedia dump files"
12
+ s.description = "WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata."
14
13
  s.rubyforge_project = "wp2txt"
15
-
16
- s.files = `git ls-files`.split("\n")
14
+ s.license = "MIT"
15
+ s.required_ruby_version = Gem::Requirement.new(">= 2.6")
16
+ s.files = `git ls-files`.split("\n")
17
17
  s.files -= ["data/*", "image/*"]
18
- s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
19
- s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
18
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
19
+ s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
20
20
  s.require_paths = ["lib"]
21
-
22
- # s.add_development_dependency "bundler"
23
- # s.add_development_dependency "rspec"
24
- # s.add_development_dependency "rake"
25
-
26
- s.add_dependency "nokogiri"
27
- s.add_dependency "ruby-progressbar"
28
- s.add_dependency "parallel"
21
+ s.add_development_dependency "bundler"
22
+ s.add_development_dependency "rake"
23
+ s.add_development_dependency "rspec"
29
24
  s.add_dependency "htmlentities"
25
+ s.add_dependency "nokogiri"
30
26
  s.add_dependency "optimist"
27
+ s.add_dependency "parallel"
31
28
  s.add_dependency "pastel"
29
+ s.add_dependency "ruby-progressbar"
32
30
  s.add_dependency "tty-spinner"
33
31
  end
metadata CHANGED
@@ -1,23 +1,23 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-11 00:00:00.000000000 Z
11
+ date: 2023-01-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nokogiri
14
+ name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
- type: :runtime
20
+ type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
@@ -25,13 +25,13 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
- name: ruby-progressbar
28
+ name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0'
34
- type: :runtime
34
+ type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
@@ -39,13 +39,13 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: parallel
42
+ name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - ">="
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
- type: :runtime
48
+ type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: optimist
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -80,6 +94,20 @@ dependencies:
80
94
  - - ">="
81
95
  - !ruby/object:Gem::Version
82
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: parallel
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
83
111
  - !ruby/object:Gem::Dependency
84
112
  name: pastel
85
113
  requirement: !ruby/object:Gem::Requirement
@@ -94,6 +122,20 @@ dependencies:
94
122
  - - ">="
95
123
  - !ruby/object:Gem::Version
96
124
  version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: ruby-progressbar
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
97
139
  - !ruby/object:Gem::Dependency
98
140
  name: tty-spinner
99
141
  requirement: !ruby/object:Gem::Requirement
@@ -117,7 +159,12 @@ executables:
117
159
  extensions: []
118
160
  extra_rdoc_files: []
119
161
  files:
162
+ - ".dockerignore"
163
+ - ".github/workflows/ci.yml"
120
164
  - ".gitignore"
165
+ - ".rubocop.yml"
166
+ - ".solargraph.yml"
167
+ - Dockerfile
121
168
  - Gemfile
122
169
  - LICENSE
123
170
  - README.md
@@ -136,13 +183,15 @@ files:
136
183
  - image/wp2txt.svg
137
184
  - lib/wp2txt.rb
138
185
  - lib/wp2txt/article.rb
186
+ - lib/wp2txt/regex.rb
139
187
  - lib/wp2txt/utils.rb
140
188
  - lib/wp2txt/version.rb
141
189
  - spec/spec_helper.rb
142
190
  - spec/utils_spec.rb
143
191
  - wp2txt.gemspec
144
192
  homepage: https://github.com/yohasebe/wp2txt
145
- licenses: []
193
+ licenses:
194
+ - MIT
146
195
  metadata: {}
147
196
  post_install_message:
148
197
  rdoc_options: []
@@ -152,14 +201,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
152
201
  requirements:
153
202
  - - ">="
154
203
  - !ruby/object:Gem::Version
155
- version: '0'
204
+ version: '2.6'
156
205
  required_rubygems_version: !ruby/object:Gem::Requirement
157
206
  requirements:
158
207
  - - ">="
159
208
  - !ruby/object:Gem::Version
160
209
  version: '0'
161
210
  requirements: []
162
- rubygems_version: 3.3.7
211
+ rubygems_version: 3.4.1
163
212
  signing_key:
164
213
  specification_version: 4
165
214
  summary: A command-line toolkit to extract text content and category data from Wikipedia