wp2txt 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/spec/utils_spec.rb CHANGED
@@ -1,10 +1,9 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
1
+ # frozen_string_literal: true
3
2
 
4
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
5
- require 'wp2txt'
6
- require 'wp2txt/article'
7
- require 'wp2txt/utils'
3
+ require_relative "spec_helper"
4
+ require_relative "../lib/wp2txt"
5
+ require_relative "../lib/wp2txt/article"
6
+ require_relative "../lib/wp2txt/utils"
8
7
 
9
8
  describe "Wp2txt" do
10
9
  it "contains mediawiki-format related functions:" do
@@ -17,14 +16,14 @@ describe "Wp2txt" do
17
16
 
18
17
  describe "process_nested_structure" do
19
18
  it "parse nested structure replacing str in the format specified" do
20
- str_before = "[[ab[[cde[[alfa]]]]fg]]"
21
- str_after = "<<ab<<cde<<alfa>>>>fg>>"
22
- scanner = StringScanner.new(str_before)
23
- str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
19
+ str_before1 = "[[ab[[cde[[alfa]]]]fg]]"
20
+ str_after1 = "<<ab<<cde<<alfa>>>>fg>>"
21
+ scanner1 = StringScanner.new(str_before1)
22
+ str_processed = process_nested_structure(scanner1, "[[", "]]") do |content|
24
23
  "<<" + content + ">>"
25
24
  end
26
- expect(str_processed).to eq str_after
27
-
25
+ expect(str_processed).to eq str_after1
26
+
28
27
  str_before = "#* {{quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
29
28
  |passage={{...}} every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.}}"
30
29
  str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
@@ -33,177 +32,155 @@ describe "Wp2txt" do
33
32
  str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
34
33
  "<<" + content + ">>"
35
34
  end
36
- #str_processed.should == str_after
37
35
  expect(str_processed).to eq str_after
38
-
39
36
  end
40
37
  end
41
-
42
- describe "special_chr!" do
38
+
39
+ describe "special_chr" do
43
40
  it "replaces character references with real characters" do
44
41
  str_before = "&nbsp; &lt; &gt; &amp; &quot;"
45
42
  str_after = "  < > & \""
46
- special_chr!(str_before)
47
- expect(str_before).to eq str_after
48
- end
43
+ expect(special_chr(str_before)).to eq str_after
44
+ end
49
45
  end
50
-
51
- describe "chrref_to_utf!" do
46
+
47
+ describe "chrref_to_utf" do
52
48
  it "replaces character references with real characters" do
53
49
  str_before = "&#x266A;"
54
50
  str_after = "♪"
55
- chrref_to_utf!(str_before)
56
- expect(str_before).to eq str_after
51
+ expect(chrref_to_utf(str_before)).to eq str_after
57
52
  end
58
53
  end
59
-
60
- describe "mndash!" do
54
+
55
+ describe "mndash" do
61
56
  it "replaces {mdash}, {ndash}, or {–} with '–'" do
62
57
  str_before = "{mdash} {ndash} {–}"
63
58
  str_after = "– – –"
64
- mndash!(str_before)
65
- expect(str_before).to eq str_after
59
+ expect(mndash(str_before)).to eq str_after
66
60
  end
67
61
  end
68
-
62
+
69
63
  describe "make_reference" do
70
64
  it "replaces <ref> tag with [ref]" do
71
65
  str_before = "<ref> ... </ref>"
72
66
  str_after = "[ref] ... [/ref]"
73
- make_reference!(str_before)
74
- expect(str_before).to eq str_after
75
- end
67
+ expect(make_reference(str_before)).to eq str_after
68
+ end
76
69
  end
77
-
78
- describe "remove_table!" do
70
+
71
+ describe "remove_table" do
79
72
  it "removes table formated parts" do
80
73
  str_before = "{| ... \n{| ... \n ...|}\n ...|}"
81
74
  str_after = ""
82
- remove_table!(str_before)
83
- expect(str_before).to eq str_after
84
- end
85
- end
86
-
87
- # describe "remove_clade" do
88
- # it "removes clade formated parts" do
89
- # str_before = "\{\{clade ... \n ... \n ... \n\}\}"
90
- # str_after = ""
91
- # expect(remove_clade(str_before)).to eq str_after
92
- # end
93
- # end
94
-
95
- describe "remove_hr!" do
75
+ expect(remove_table(str_before)).to eq str_after
76
+ end
77
+ end
78
+
79
+ describe "remove_hr" do
96
80
  it "removes horizontal lines" do
97
81
  str_before = "\n----\n--\n--\n"
98
82
  str_after = "\n\n"
99
- remove_hr!(str_before)
100
- expect(str_before).to eq str_after
101
- end
83
+ expect(remove_hr(str_before)).to eq str_after
84
+ end
102
85
  end
103
86
 
104
- describe "remove_inbetween!" do
87
+ describe "remove_inbetween" do
105
88
  it "removes tags and its contents" do
106
- str_before = "<tag>abc</tag>"
107
- str_after = "abc"
108
- remove_tag!(str_before)
109
- expect(str_before).to eq str_after
110
- str_before = "[tag]def[/tag]"
111
- str_after = "def"
112
- remove_inbetween!(str_before, ['[', ']'])
113
- expect(str_before).to eq str_after
114
- end
115
- end
116
-
117
- describe "remove_directive!" do
89
+ str_before1 = "<tag>abc</tag>"
90
+ str_after1 = "abc"
91
+ expect(remove_tag(str_before1)).to eq str_after1
92
+
93
+ str_before2 = "[tag]def[/tag]"
94
+ str_after2 = "def"
95
+ expect(remove_inbetween(str_before2, ["[", "]"])).to eq str_after2
96
+ end
97
+ end
98
+
99
+ describe "remove_directive" do
118
100
  it "removes directive" do
119
101
  str_before = "__abc__\n __def__"
120
102
  str_after = "\n "
121
- remove_directive!(str_before)
122
- expect(str_before).to eq str_after
123
- end
103
+ expect(remove_directive(str_before)).to eq str_after
104
+ end
124
105
  end
125
106
 
126
- describe "remove_emphasis!" do
107
+ describe "remove_emphasis" do
127
108
  it "removes directive" do
128
109
  str_before = "''abc''\n'''def'''"
129
110
  str_after = "abc\ndef"
130
- remove_emphasis!(str_before)
131
- expect(str_before).to eq str_after
132
- end
111
+ expect(remove_emphasis(str_before)).to eq str_after
112
+ end
133
113
  end
134
-
135
- describe "escape_nowiki!" do
114
+
115
+ describe "escape_nowiki" do
136
116
  it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
137
117
  str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
138
118
  str_after = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
139
- escape_nowiki!(str_before)
140
- expect(str_before).to match str_after
119
+ expect(escape_nowiki(str_before)).to match str_after
141
120
  end
142
121
  end
143
122
 
144
- describe "unescape_nowiki!" do
123
+ describe "unescape_nowiki" do
145
124
  it "replaces <nowiki-object_id> with string stored elsewhere" do
146
- @nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
125
+ @nowikis = { 123 => "[[abc]]", 124 => "[[ghi]]" }
147
126
  str_before = "<nowiki-123>def<nowiki-124>"
148
127
  str_after = "[[abc]]def[[ghi]]"
149
- unescape_nowiki!(str_before)
150
- expect(str_before).to eq str_after
128
+ expect(unescape_nowiki(str_before)).to eq str_after
151
129
  end
152
130
  end
153
-
154
- describe "process_interwiki_links!" do
131
+
132
+ describe "process_interwiki_links" do
155
133
  it "formats text link and remove brackets" do
156
- a = "[[a b]]"
157
- b = "[[a b|c]]"
158
- c = "[[a|b|c]]"
159
- d = "[[硬口蓋鼻音|[ɲ], /J/]]"
160
- process_interwiki_links!(a)
161
- process_interwiki_links!(b)
162
- process_interwiki_links!(c)
163
- process_interwiki_links!(d)
164
- expect(a).to eq "a b"
165
- expect(b).to eq "c"
166
- expect(c).to eq "b|c"
167
- expect(d).to eq "[ɲ], /J/"
134
+ a1 = "[[a b]]"
135
+ b1 = "[[a b|c]]"
136
+ c1 = "[[a|b|c]]"
137
+ d1 = "[[硬口蓋鼻音|[ɲ], /J/]]"
138
+ a2 = process_interwiki_links(a1)
139
+ b2 = process_interwiki_links(b1)
140
+ c2 = process_interwiki_links(c1)
141
+ d2 = process_interwiki_links(d1)
142
+ expect(a2).to eq "a b"
143
+ expect(b2).to eq "c"
144
+ expect(c2).to eq "b|c"
145
+ expect(d2).to eq "[ɲ], /J/"
168
146
  end
169
147
  end
170
148
 
171
- describe "process_external_links!" do
149
+ describe "process_external_links" do
172
150
  it "formats text link and remove brackets" do
173
- a = "[http://yohasebe.com yohasebe.com]"
174
- b = "[http://yohasebe.com]"
175
- c = "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
176
- process_external_links!(a)
177
- process_external_links!(b)
178
- process_external_links!(c)
179
- expect(a).to eq "yohasebe.com"
180
- expect(b).to eq "http://yohasebe.com"
181
- expect(c).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
151
+ a1 = "[http://yohasebe.com yohasebe.com]"
152
+ b1 = "[http://yohasebe.com]"
153
+ c1 = "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
154
+ a2 = process_external_links(a1)
155
+ b2 = process_external_links(b1)
156
+ c2 = process_external_links(c1)
157
+ expect(a2).to eq "yohasebe.com"
158
+ expect(b2).to eq "http://yohasebe.com"
159
+ expect(c2).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
182
160
  end
183
161
  end
184
-
185
- describe "correct_inline_template!" do
162
+
163
+ describe "correct_inline_template" do
186
164
  it "removes brackets and leaving some text" do
187
- str_before = "{{MedalCountry | {{JPN}} }}"
188
- str_after = "JPN"
189
- correct_inline_template!(str_before)
190
- expect(str_before).to eq str_after
191
- str_before = "{{lang|en|Japan}}"
192
- str_after = "Japan"
193
- correct_inline_template!(str_before)
194
- expect(str_before).to eq str_after
195
- str_before = "{{a|b=c|d=f}}"
196
- str_after = "c"
197
- correct_inline_template!(str_before)
198
- expect(str_before).to eq str_after
199
- str_before = "{{a|b|{{c|d|e}}}}"
200
- str_after = "b"
201
- correct_inline_template!(str_before)
202
- expect(str_before).to eq str_after
203
- str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
204
- str_after = "日本人に多く見受けられる"
205
- correct_inline_template!(str_before)
206
- expect(str_before).to eq str_after
165
+ str_before1 = "{{MedalCountry | {{JPN}} }}"
166
+ str_after1 = "JPN"
167
+ expect(correct_inline_template(str_before1)).to eq str_after1
168
+
169
+ str_before2 = "{{lang|en|Japan}}"
170
+ str_after2 = "Japan"
171
+ expect(correct_inline_template(str_before2)).to eq str_after2
172
+
173
+ str_before3 = "{{a|b=c|d=f}}"
174
+ str_after3 = "c"
175
+ expect(correct_inline_template(str_before3)).to eq str_after3
176
+
177
+ str_before4 = "{{a|b|{{c|d|e}}}}"
178
+ str_after4 = "b"
179
+ expect(correct_inline_template(str_before4)).to eq str_after4
180
+
181
+ str_before5 = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
182
+ str_after5 = "日本人に多く見受けられる"
183
+ expect(correct_inline_template(str_before5)).to eq str_after5
207
184
  end
208
185
  end
209
186
  end
data/wp2txt.gemspec CHANGED
@@ -1,6 +1,6 @@
1
- # -*- coding: utf-8 -*-
2
- $:.push File.expand_path("../lib", __FILE__)
3
- require "wp2txt/version"
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/wp2txt/version"
4
4
 
5
5
  Gem::Specification.new do |s|
6
6
  s.name = "wp2txt"
@@ -8,26 +8,24 @@ Gem::Specification.new do |s|
8
8
  s.authors = ["Yoichiro Hasebe"]
9
9
  s.email = ["yohasebe@gmail.com"]
10
10
  s.homepage = "https://github.com/yohasebe/wp2txt"
11
- s.summary = %q{A command-line toolkit to extract text content and category data from Wikipedia dump files}
12
- s.description = %q{WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.}
13
-
11
+ s.summary = "A command-line toolkit to extract text content and category data from Wikipedia dump files"
12
+ s.description = "WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata."
14
13
  s.rubyforge_project = "wp2txt"
15
-
16
- s.files = `git ls-files`.split("\n")
14
+ s.license = "MIT"
15
+ s.required_ruby_version = Gem::Requirement.new(">= 2.6")
16
+ s.files = `git ls-files`.split("\n")
17
17
  s.files -= ["data/*", "image/*"]
18
- s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
19
- s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
18
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
19
+ s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
20
20
  s.require_paths = ["lib"]
21
-
22
- # s.add_development_dependency "bundler"
23
- # s.add_development_dependency "rspec"
24
- # s.add_development_dependency "rake"
25
-
26
- s.add_dependency "nokogiri"
27
- s.add_dependency "ruby-progressbar"
28
- s.add_dependency "parallel"
21
+ s.add_development_dependency "bundler"
22
+ s.add_development_dependency "rake"
23
+ s.add_development_dependency "rspec"
29
24
  s.add_dependency "htmlentities"
25
+ s.add_dependency "nokogiri"
30
26
  s.add_dependency "optimist"
27
+ s.add_dependency "parallel"
31
28
  s.add_dependency "pastel"
29
+ s.add_dependency "ruby-progressbar"
32
30
  s.add_dependency "tty-spinner"
33
31
  end
metadata CHANGED
@@ -1,23 +1,23 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-11 00:00:00.000000000 Z
11
+ date: 2023-01-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: nokogiri
14
+ name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
- type: :runtime
20
+ type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
@@ -25,13 +25,13 @@ dependencies:
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
- name: ruby-progressbar
28
+ name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0'
34
- type: :runtime
34
+ type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
@@ -39,13 +39,13 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: parallel
42
+ name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - ">="
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
- type: :runtime
48
+ type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: optimist
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -80,6 +94,20 @@ dependencies:
80
94
  - - ">="
81
95
  - !ruby/object:Gem::Version
82
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: parallel
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
83
111
  - !ruby/object:Gem::Dependency
84
112
  name: pastel
85
113
  requirement: !ruby/object:Gem::Requirement
@@ -94,6 +122,20 @@ dependencies:
94
122
  - - ">="
95
123
  - !ruby/object:Gem::Version
96
124
  version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: ruby-progressbar
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
97
139
  - !ruby/object:Gem::Dependency
98
140
  name: tty-spinner
99
141
  requirement: !ruby/object:Gem::Requirement
@@ -117,7 +159,12 @@ executables:
117
159
  extensions: []
118
160
  extra_rdoc_files: []
119
161
  files:
162
+ - ".dockerignore"
163
+ - ".github/workflows/ci.yml"
120
164
  - ".gitignore"
165
+ - ".rubocop.yml"
166
+ - ".solargraph.yml"
167
+ - Dockerfile
121
168
  - Gemfile
122
169
  - LICENSE
123
170
  - README.md
@@ -136,13 +183,15 @@ files:
136
183
  - image/wp2txt.svg
137
184
  - lib/wp2txt.rb
138
185
  - lib/wp2txt/article.rb
186
+ - lib/wp2txt/regex.rb
139
187
  - lib/wp2txt/utils.rb
140
188
  - lib/wp2txt/version.rb
141
189
  - spec/spec_helper.rb
142
190
  - spec/utils_spec.rb
143
191
  - wp2txt.gemspec
144
192
  homepage: https://github.com/yohasebe/wp2txt
145
- licenses: []
193
+ licenses:
194
+ - MIT
146
195
  metadata: {}
147
196
  post_install_message:
148
197
  rdoc_options: []
@@ -152,14 +201,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
152
201
  requirements:
153
202
  - - ">="
154
203
  - !ruby/object:Gem::Version
155
- version: '0'
204
+ version: '2.6'
156
205
  required_rubygems_version: !ruby/object:Gem::Requirement
157
206
  requirements:
158
207
  - - ">="
159
208
  - !ruby/object:Gem::Version
160
209
  version: '0'
161
210
  requirements: []
162
- rubygems_version: 3.3.7
211
+ rubygems_version: 3.4.1
163
212
  signing_key:
164
213
  specification_version: 4
165
214
  summary: A command-line toolkit to extract text content and category data from Wikipedia