wp2txt 0.9.5 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/wp2txt.rb CHANGED
@@ -4,42 +4,25 @@
4
4
  $: << File.join(File.dirname(__FILE__))
5
5
 
6
6
  require "nokogiri"
7
- require "parallel"
8
-
9
- require 'etc'
10
- require 'pp'
11
7
  require "wp2txt/article"
12
8
  require "wp2txt/utils"
13
- require "wp2txt/progressbar"
14
- # require "wp2txt/mw_api"
15
-
16
- begin
17
- require "bzip2-ruby"
18
- NO_BZ2 = false
19
- rescue LoadError
20
- # in case bzip2-ruby gem is not available
21
- NO_BZ2 = true
22
- end
23
9
 
24
10
  module Wp2txt
25
- class Runner
26
-
11
+ class Splitter
27
12
  include Wp2txt
28
-
29
- def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
30
- @parent = parent
13
+ def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)
31
14
  @fp = nil
32
-
33
15
  @input_file = input_file
34
16
  @output_dir = output_dir
35
17
  @tfile_size = tfile_size
36
- @convert = convert
37
- @strip_tmarker = strip_tmarker
38
- num_cores_available = Etc.nprocessors
39
- @num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
18
+ if bz2_gem
19
+ require "bzip2-ruby"
20
+ end
21
+ @bz2_gem = bz2_gem
22
+ prepare
40
23
  end
41
-
42
- def file_size(file)
24
+
25
+ def file_size(file)
43
26
  origin = Time.now
44
27
  size = 0; unit = 10485760; star = 0; before = Time.now.to_f
45
28
  error_count = 10
@@ -49,7 +32,7 @@ module Wp2txt
49
32
  rescue => e
50
33
  a = nil
51
34
  end
52
- break unless a
35
+ break unless a
53
36
 
54
37
  present = Time.now.to_f
55
38
  size += a.size
@@ -57,88 +40,62 @@ module Wp2txt
57
40
  star = 0 if star > 10
58
41
  star += 1
59
42
  before = present
60
- end
43
+ end
61
44
  end
62
45
  time_elapsed = Time.now - origin
63
46
  size
64
47
  end
65
-
66
- # control the display of command line progressbar (or gui which is not available for now)
67
- def notify_parent(last = false)
68
- @last_time ||= Time.now.to_f
69
- @elapsed_sum ||= 0
70
- time_now = Time.now.to_f
71
- elapsed_from_last = (time_now - @last_time).to_i
72
-
73
- if elapsed_from_last > 0.3 || last
74
48
 
75
- @last_time = time_now
76
- @elapsed_sum += elapsed_from_last
77
- gvalue = (@size_read.to_f / @infile_size.to_f * 100 * 100).to_i
78
- elt_str = sec_to_str(@elapsed_sum)
79
- if last
80
- eta_str = "00:00:00"
81
- else
82
- lines_persec = @size_read / @elapsed_sum if @elapsed_sum > 0
83
- eta_sec = (@infile_size - @size_read) / lines_persec
84
- eta_str = sec_to_str(eta_sec)
85
- end
86
- @parent.prg_update(gvalue, elt_str, eta_str)
49
+ # check if a given command exists: return the path if it does, return false if not
50
+ def command_exist?(command)
51
+ basename = File.basename(command)
52
+ path = ""
53
+ print "Checking #{basename}: "
54
+ if open("| which #{command} 2>/dev/null"){ |f| path = f.gets.strip }
55
+ puts "detected [#{path}]"
56
+ return path.strip
57
+ elsif open("| which #{basename} 2>/dev/null"){ |f| path = f.gets.strip }
58
+ puts "detected [#{path}]"
59
+ return path.strip
60
+ else
61
+ puts "not found"
62
+ return false
87
63
  end
88
64
  end
89
65
 
90
- # check the size of input file (bz2 or plain xml) when uncompressed
66
+ # check the size of input file (bz2 or plain xml) when decompressed
91
67
  def prepare
92
-
93
68
  # if output_dir is not specified, output in the same directory
94
69
  # as the imput file
95
70
  if !@output_dir && @input_file
96
71
  @output_dir = File.dirname(@input_file)
97
72
  end
98
73
 
99
- # if input file is bz2 compressed, use bz2-ruby if available,
100
- # use command line bzip2 program otherwise.
101
74
  if /.bz2$/ =~ @input_file
102
- unless NO_BZ2
103
- file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
104
- @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
105
- @parent.msg("Preparing ... This may take several minutes or more ", 0)
106
- @infile_size = file_size(file)
107
- @parent.msg("... Done.", 1)
108
- file.close
75
+ if @bz2_gem
109
76
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
77
+ elsif RUBY_PLATFORM.index("win32")
78
+ file = IO.popen("bunzip2.exe -c #{@input_file}")
110
79
  else
111
- if RUBY_PLATFORM.index("win32")
112
- file = IO.popen("bunzip2.exe -c #{@input_file}")
113
- else
114
- file = IO.popen("bzip2 -c -d #{@input_file}")
115
- end
116
- @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
117
- @parent.msg("Preparing ... This may take several minutes or more ", 0)
118
- @infile_size = file_size(file)
119
- @parent.msg("... Done.", 1)
120
- file.close # try to reopen since rewind method is unavailable
121
- if RUBY_PLATFORM.index("win32")
122
- file = IO.popen("bunzip2.exe -c #{@input_file}")
123
- else
124
- file = IO.popen("bzip2 -c -d #{@input_file}")
80
+ if bzpath = command_exist?("lbzip2") ||
81
+ command_exist?("pbzip2") ||
82
+ command_exist?("bzip2")
83
+ file = IO.popen("#{bzpath} -c -d #{@input_file}")
125
84
  end
126
- end
85
+ end
127
86
  else # meaning that it is a text file
128
87
  @infile_size = File.stat(@input_file).size
129
88
  file = open(@input_file)
130
89
  end
131
90
 
132
91
  #create basename of output file
133
- @outfile_base = File.basename(@input_file, ".*") + "-"
92
+ @outfile_base = File.basename(@input_file, ".*") + "-"
134
93
  @total_size = 0
135
94
  @file_index = 1
136
95
  outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
137
96
  @outfiles = []
138
97
  @outfiles << outfilename
139
- @fp = File.open(outfilename, "w")
140
- @parent.before
141
- @parent.data_set(@input_file, 100 * 100)
98
+ @fp = File.open(outfilename, "w")
142
99
  @file_pointer = file
143
100
  return true
144
101
  end
@@ -156,7 +113,110 @@ module Wp2txt
156
113
  # temp_buf is filled with text split by "\n"
157
114
  temp_buf = []
158
115
  ss = StringScanner.new(new_lines)
159
- while ss.scan(/.*?\n/m)
116
+ while ss.scan(/.*?\n/m)
117
+ temp_buf << ss[0]
118
+ end
119
+ temp_buf << ss.rest unless ss.eos?
120
+
121
+ new_first_line = temp_buf.shift
122
+ if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
123
+ @buffer.last << new_first_line
124
+ @buffer << ""
125
+ else
126
+ @buffer.last << new_first_line
127
+ end
128
+ @buffer += temp_buf unless temp_buf.empty?
129
+ if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
130
+ @buffer << ""
131
+ end
132
+ break if @buffer.size > 1
133
+ end
134
+ return true
135
+ end
136
+
137
+ def get_newline
138
+ @buffer ||= [""]
139
+ if @buffer.size == 1
140
+ return nil unless fill_buffer
141
+ end
142
+ if @buffer.empty?
143
+ return nil
144
+ else
145
+ new_line = @buffer.shift
146
+ return new_line
147
+ end
148
+ end
149
+
150
+ def split_file
151
+ output_text = ""
152
+ end_flag = false
153
+ while text = get_newline
154
+ @count ||= 0;@count += 1;
155
+ @size_read ||=0
156
+ @size_read += text.bytesize
157
+ @total_size += text.bytesize
158
+ output_text << text
159
+ end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
160
+ # never close the file until the end of the page even if end_flag is on
161
+ if end_flag && /<\/page/ =~ text
162
+ @fp.puts(output_text)
163
+ output_text = ""
164
+ @total_size = 0
165
+ end_flag = false
166
+ @fp.close
167
+ @file_index += 1
168
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
169
+ @outfiles << outfilename
170
+ @fp = File.open(outfilename, "w")
171
+ next
172
+ end
173
+ end
174
+ @fp.puts(output_text) if output_text != ""
175
+ @fp.close
176
+
177
+ if File.size(outfilename) == 0
178
+ File.delete(outfilename)
179
+ @outfiles.delete(outfilename)
180
+ end
181
+
182
+ rename(@outfiles, "xml")
183
+ end
184
+ end
185
+
186
+ class Runner
187
+ include Wp2txt
188
+
189
+ def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)
190
+ @fp = nil
191
+ @input_file = input_file
192
+ @output_dir = output_dir
193
+ @strip_tmarker = strip_tmarker
194
+ @del_interfile = del_interfile
195
+ prepare
196
+ end
197
+
198
+ def prepare
199
+ @infile_size = File.stat(@input_file).size
200
+ file = open(@input_file)
201
+ @file_pointer = file
202
+ @outfile_base = File.basename(@input_file, ".*")
203
+ @total_size = 0
204
+ return true
205
+ end
206
+
207
+ def fill_buffer
208
+ while true do
209
+ begin
210
+ new_lines = @file_pointer.read(10485760)
211
+ rescue => e
212
+ return nil
213
+ end
214
+ return nil unless new_lines
215
+
216
+ # temp_buf is filled with text split by "\n"
217
+ temp_buf = []
218
+ ss = StringScanner.new(new_lines)
219
+ while ss.scan(/.*?\n/m)
160
220
  temp_buf << ss[0]
161
221
  end
162
222
  temp_buf << ss.rest unless ss.eos?
@@ -178,25 +238,22 @@ module Wp2txt
178
238
  end
179
239
 
180
240
  def get_newline
181
- @buffer ||= [""]
241
+ @buffer ||= [""]
182
242
  if @buffer.size == 1
183
243
  return nil unless fill_buffer
184
244
  end
185
245
  if @buffer.empty?
186
246
  return nil
187
- else
247
+ else
188
248
  new_line = @buffer.shift
189
249
  return new_line
190
- end
250
+ end
191
251
  end
192
252
 
193
253
  def get_page
194
254
  inside_page = false
195
255
  page = ""
196
256
  while line = get_newline
197
- notify_parent
198
- @size_read ||=0; @size_read += line.bytesize
199
-
200
257
  if /<page>/ =~ line #
201
258
  page << line
202
259
  inside_page = true
@@ -215,22 +272,7 @@ module Wp2txt
215
272
  end
216
273
  end
217
274
 
218
- # call this method to do the job
219
275
  def extract_text(&block)
220
- prepare
221
- if @convert
222
- if block
223
- extract_and_convert(&block)
224
- else
225
- extract_and_convert
226
- end
227
- else
228
- # output the original xml only split to files of the specified size
229
- extract
230
- end
231
- end
232
-
233
- def extract_and_convert(&block)
234
276
  in_text = false
235
277
  in_message = false
236
278
  result_text = ""
@@ -241,17 +283,15 @@ module Wp2txt
241
283
  pages = []
242
284
  data_empty = false
243
285
 
244
- begin
286
+ while !data_empty
245
287
  page = get_page
246
288
  if page
247
289
  pages << page
248
290
  else
249
291
  data_empty = true
250
292
  end
251
- if data_empty || pages.size == @num_threads
252
- # pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
253
- pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
254
- page_text = {:order => n, :data => nil}
293
+ if data_empty
294
+ pages.each do |page|
255
295
  xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
256
296
  xml = xmlns + page + "</mediawiki>"
257
297
 
@@ -270,79 +310,22 @@ module Wp2txt
270
310
  end
271
311
  end
272
312
  article = Article.new(text, title, @strip_tmarker)
273
- page_text[:data] = block.call(article)
313
+ page_text = block.call(article)
314
+ output_text << page_text
274
315
  end
275
- page_text
276
- end
277
- pages.clear
278
- pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
279
- pages_text.each do |page_text|
280
- output_text << page_text
281
- @count ||= 0; @count += 1;
282
- @total_size = output_text.bytesize
283
- # flagged when data exceeds the size of output file
284
- end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
285
316
  end
286
317
 
287
- #close the present file, then open a new one
288
- if end_flag
289
- cleanup!(output_text)
318
+ cleanup!(output_text)
319
+ if output_text.size > 0
320
+ outfilename = File.join(@output_dir, @outfile_base + ".txt")
321
+ @fp = File.open(outfilename, "w")
290
322
  @fp.puts(output_text)
291
- output_text = ""
292
- @total_size = 0
293
- end_flag = false
294
323
  @fp.close
295
- @file_index += 1
296
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
297
- @outfiles << outfilename
298
- @fp = File.open(outfilename, "w")
299
- next
300
324
  end
301
- end
302
- end while !data_empty
303
-
304
- if output_text != ""
305
- cleanup!(output_text)
306
- @fp.puts(output_text)
307
- end
308
- notify_parent(true)
309
- @parent.after
310
- @fp.close
311
- rename(@outfiles)
312
- @parent.msg("Processing finished", 1)
313
- end
314
-
315
- def extract
316
- output_text = ""
317
- end_flag = false
318
- while text = get_newline
319
- @count ||= 0;@count += 1;
320
- @size_read ||=0;@size_read += text.bytesize
321
- @total_size += text.bytesize
322
- output_text << text
323
- end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
324
- notify_parent
325
- # never close the file until the end of the page even if end_flag is on
326
- if end_flag && /<\/page/ =~ text
327
- @fp.puts(output_text)
325
+ File.delete(@input_file) if @del_interfile
328
326
  output_text = ""
329
- @total_size = 0
330
- end_flag = false
331
- @fp.close
332
- @file_index += 1
333
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
334
- @outfiles << outfilename
335
- @fp = File.open(outfilename, "w")
336
- next
337
327
  end
338
328
  end
339
- @fp.puts(output_text) if output_text != ""
340
- notify_parent(true)
341
- @parent.after
342
- @fp.close
343
- rename(@outfiles)
344
- @parent.msg("Processing finished", 1)
345
- end
329
+ end
346
330
  end
347
331
  end
348
-
data/spec/utils_spec.rb CHANGED
@@ -184,10 +184,6 @@ describe "Wp2txt" do
184
184
 
185
185
  describe "correct_inline_template!" do
186
186
  it "removes brackets and leaving some text" do
187
- # str_before = "{{}}"
188
- # str_after = ""
189
- # correct_inline_template!(str_before)
190
- # expect(str_before).to eq str_after
191
187
  str_before = "{{MedalCountry | {{JPN}} }}"
192
188
  str_after = "JPN"
193
189
  correct_inline_template!(str_before)
@@ -197,11 +193,11 @@ describe "Wp2txt" do
197
193
  correct_inline_template!(str_before)
198
194
  expect(str_before).to eq str_after
199
195
  str_before = "{{a|b=c|d=f}}"
200
- str_after = "a"
196
+ str_after = "c"
201
197
  correct_inline_template!(str_before)
202
198
  expect(str_before).to eq str_after
203
199
  str_before = "{{a|b|{{c|d|e}}}}"
204
- str_after = "e"
200
+ str_after = "b"
205
201
  correct_inline_template!(str_before)
206
202
  expect(str_before).to eq str_after
207
203
  str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
@@ -210,18 +206,4 @@ describe "Wp2txt" do
210
206
  expect(str_before).to eq str_after
211
207
  end
212
208
  end
213
-
214
- # describe "expand_template" do
215
- # it "gets data corresponding to a given template using mediawiki api" do
216
- # uri = "http://en.wiktionary.org/w/api.php"
217
- # template = "{{en-verb}}"
218
- # word = "kick"
219
- # expanded = expand_template(uri, template, word)
220
- # html =<<EOD
221
- # <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
222
- # EOD
223
- # html.strip!
224
- # expanded.should == html
225
- # end
226
- # end
227
- end
209
+ end
data/wp2txt.gemspec CHANGED
@@ -7,13 +7,14 @@ Gem::Specification.new do |s|
7
7
  s.version = Wp2txt::VERSION
8
8
  s.authors = ["Yoichiro Hasebe"]
9
9
  s.email = ["yohasebe@gmail.com"]
10
- s.homepage = "http://github.com/yohasebe/wp2txt"
11
- s.summary = %q{Wikipedia dump to text converter}
12
- s.description = %q{WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.}
10
+ s.homepage = "https://github.com/yohasebe/wp2txt"
11
+ s.summary = %q{A command-line toolkit to extract text content and category data from Wikipedia dump files}
12
+ s.description = %q{WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.}
13
13
 
14
14
  s.rubyforge_project = "wp2txt"
15
15
 
16
16
  s.files = `git ls-files`.split("\n")
17
+ s.files -= ["data/*", "image/*"]
17
18
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
19
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
20
  s.require_paths = ["lib"]
@@ -23,7 +24,10 @@ Gem::Specification.new do |s|
23
24
  # s.add_development_dependency "rake"
24
25
 
25
26
  s.add_dependency "nokogiri"
27
+ s.add_dependency "ruby-progressbar"
26
28
  s.add_dependency "parallel"
27
29
  s.add_dependency "htmlentities"
28
30
  s.add_dependency "optimist"
31
+ s.add_dependency "pastel"
32
+ s.add_dependency "tty-spinner"
29
33
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.5
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-02 00:00:00.000000000 Z
11
+ date: 2022-08-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: ruby-progressbar
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: parallel
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -66,12 +80,39 @@ dependencies:
66
80
  - - ">="
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
69
- description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
70
- XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
83
+ - !ruby/object:Gem::Dependency
84
+ name: pastel
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: tty-spinner
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: WP2TXT extracts text and category data from Wikipedia dump files (encoded
112
+ in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.
71
113
  email:
72
114
  - yohasebe@gmail.com
73
115
  executables:
74
- - benchmark.rb
75
116
  - wp2txt
76
117
  extensions: []
77
118
  extra_rdoc_files: []
@@ -81,26 +122,26 @@ files:
81
122
  - LICENSE
82
123
  - README.md
83
124
  - Rakefile
84
- - bin/benchmark.rb
85
125
  - bin/wp2txt
86
126
  - data/output_samples/testdata_en.txt
87
- - data/output_samples/testdata_en_categories.txt
127
+ - data/output_samples/testdata_en_category.txt
88
128
  - data/output_samples/testdata_en_summary.txt
89
129
  - data/output_samples/testdata_ja.txt
90
- - data/output_samples/testdata_ja_categories.txt
130
+ - data/output_samples/testdata_ja_category.txt
91
131
  - data/output_samples/testdata_ja_summary.txt
92
132
  - data/testdata_en.bz2
93
133
  - data/testdata_ja.bz2
134
+ - image/screenshot.png
135
+ - image/wp2txt-logo.svg
136
+ - image/wp2txt.svg
94
137
  - lib/wp2txt.rb
95
138
  - lib/wp2txt/article.rb
96
- - lib/wp2txt/mw_api.rb
97
- - lib/wp2txt/progressbar.rb
98
139
  - lib/wp2txt/utils.rb
99
140
  - lib/wp2txt/version.rb
100
141
  - spec/spec_helper.rb
101
142
  - spec/utils_spec.rb
102
143
  - wp2txt.gemspec
103
- homepage: http://github.com/yohasebe/wp2txt
144
+ homepage: https://github.com/yohasebe/wp2txt
104
145
  licenses: []
105
146
  metadata: {}
106
147
  post_install_message:
@@ -121,7 +162,8 @@ requirements: []
121
162
  rubygems_version: 3.3.7
122
163
  signing_key:
123
164
  specification_version: 4
124
- summary: Wikipedia dump to text converter
165
+ summary: A command-line toolkit to extract text content and category data from Wikipedia
166
+ dump files
125
167
  test_files:
126
168
  - spec/spec_helper.rb
127
169
  - spec/utils_spec.rb