wp2txt 0.9.5 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/wp2txt.rb CHANGED
@@ -4,42 +4,25 @@
4
4
  $: << File.join(File.dirname(__FILE__))
5
5
 
6
6
  require "nokogiri"
7
- require "parallel"
8
-
9
- require 'etc'
10
- require 'pp'
11
7
  require "wp2txt/article"
12
8
  require "wp2txt/utils"
13
- require "wp2txt/progressbar"
14
- # require "wp2txt/mw_api"
15
-
16
- begin
17
- require "bzip2-ruby"
18
- NO_BZ2 = false
19
- rescue LoadError
20
- # in case bzip2-ruby gem is not available
21
- NO_BZ2 = true
22
- end
23
9
 
24
10
  module Wp2txt
25
- class Runner
26
-
11
+ class Splitter
27
12
  include Wp2txt
28
-
29
- def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
30
- @parent = parent
13
+ def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)
31
14
  @fp = nil
32
-
33
15
  @input_file = input_file
34
16
  @output_dir = output_dir
35
17
  @tfile_size = tfile_size
36
- @convert = convert
37
- @strip_tmarker = strip_tmarker
38
- num_cores_available = Etc.nprocessors
39
- @num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
18
+ if bz2_gem
19
+ require "bzip2-ruby"
20
+ end
21
+ @bz2_gem = bz2_gem
22
+ prepare
40
23
  end
41
-
42
- def file_size(file)
24
+
25
+ def file_size(file)
43
26
  origin = Time.now
44
27
  size = 0; unit = 10485760; star = 0; before = Time.now.to_f
45
28
  error_count = 10
@@ -49,7 +32,7 @@ module Wp2txt
49
32
  rescue => e
50
33
  a = nil
51
34
  end
52
- break unless a
35
+ break unless a
53
36
 
54
37
  present = Time.now.to_f
55
38
  size += a.size
@@ -57,88 +40,62 @@ module Wp2txt
57
40
  star = 0 if star > 10
58
41
  star += 1
59
42
  before = present
60
- end
43
+ end
61
44
  end
62
45
  time_elapsed = Time.now - origin
63
46
  size
64
47
  end
65
-
66
- # control the display of command line progressbar (or gui which is not available for now)
67
- def notify_parent(last = false)
68
- @last_time ||= Time.now.to_f
69
- @elapsed_sum ||= 0
70
- time_now = Time.now.to_f
71
- elapsed_from_last = (time_now - @last_time).to_i
72
-
73
- if elapsed_from_last > 0.3 || last
74
48
 
75
- @last_time = time_now
76
- @elapsed_sum += elapsed_from_last
77
- gvalue = (@size_read.to_f / @infile_size.to_f * 100 * 100).to_i
78
- elt_str = sec_to_str(@elapsed_sum)
79
- if last
80
- eta_str = "00:00:00"
81
- else
82
- lines_persec = @size_read / @elapsed_sum if @elapsed_sum > 0
83
- eta_sec = (@infile_size - @size_read) / lines_persec
84
- eta_str = sec_to_str(eta_sec)
85
- end
86
- @parent.prg_update(gvalue, elt_str, eta_str)
49
+ # check if a given command exists: return the path if it does, return false if not
50
+ def command_exist?(command)
51
+ basename = File.basename(command)
52
+ path = ""
53
+ print "Checking #{basename}: "
54
+ if open("| which #{command} 2>/dev/null"){ |f| path = f.gets.strip }
55
+ puts "detected [#{path}]"
56
+ return path.strip
57
+ elsif open("| which #{basename} 2>/dev/null"){ |f| path = f.gets.strip }
58
+ puts "detected [#{path}]"
59
+ return path.strip
60
+ else
61
+ puts "not found"
62
+ return false
87
63
  end
88
64
  end
89
65
 
90
- # check the size of input file (bz2 or plain xml) when uncompressed
66
+ # check the size of input file (bz2 or plain xml) when decompressed
91
67
  def prepare
92
-
93
68
  # if output_dir is not specified, output in the same directory
94
69
  # as the input file
95
70
  if !@output_dir && @input_file
96
71
  @output_dir = File.dirname(@input_file)
97
72
  end
98
73
 
99
- # if input file is bz2 compressed, use bz2-ruby if available,
100
- # use command line bzip2 program otherwise.
101
74
  if /.bz2$/ =~ @input_file
102
- unless NO_BZ2
103
- file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
104
- @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
105
- @parent.msg("Preparing ... This may take several minutes or more ", 0)
106
- @infile_size = file_size(file)
107
- @parent.msg("... Done.", 1)
108
- file.close
75
+ if @bz2_gem
109
76
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
77
+ elsif RUBY_PLATFORM.index("win32")
78
+ file = IO.popen("bunzip2.exe -c #{@input_file}")
110
79
  else
111
- if RUBY_PLATFORM.index("win32")
112
- file = IO.popen("bunzip2.exe -c #{@input_file}")
113
- else
114
- file = IO.popen("bzip2 -c -d #{@input_file}")
115
- end
116
- @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
117
- @parent.msg("Preparing ... This may take several minutes or more ", 0)
118
- @infile_size = file_size(file)
119
- @parent.msg("... Done.", 1)
120
- file.close # try to reopen since rewind method is unavailable
121
- if RUBY_PLATFORM.index("win32")
122
- file = IO.popen("bunzip2.exe -c #{@input_file}")
123
- else
124
- file = IO.popen("bzip2 -c -d #{@input_file}")
80
+ if bzpath = command_exist?("lbzip2") ||
81
+ command_exist?("pbzip2") ||
82
+ command_exist?("bzip2")
83
+ file = IO.popen("#{bzpath} -c -d #{@input_file}")
125
84
  end
126
- end
85
+ end
127
86
  else # meaning that it is a text file
128
87
  @infile_size = File.stat(@input_file).size
129
88
  file = open(@input_file)
130
89
  end
131
90
 
132
91
  #create basename of output file
133
- @outfile_base = File.basename(@input_file, ".*") + "-"
92
+ @outfile_base = File.basename(@input_file, ".*") + "-"
134
93
  @total_size = 0
135
94
  @file_index = 1
136
95
  outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
137
96
  @outfiles = []
138
97
  @outfiles << outfilename
139
- @fp = File.open(outfilename, "w")
140
- @parent.before
141
- @parent.data_set(@input_file, 100 * 100)
98
+ @fp = File.open(outfilename, "w")
142
99
  @file_pointer = file
143
100
  return true
144
101
  end
@@ -156,7 +113,110 @@ module Wp2txt
156
113
  # temp_buf is filled with text split by "\n"
157
114
  temp_buf = []
158
115
  ss = StringScanner.new(new_lines)
159
- while ss.scan(/.*?\n/m)
116
+ while ss.scan(/.*?\n/m)
117
+ temp_buf << ss[0]
118
+ end
119
+ temp_buf << ss.rest unless ss.eos?
120
+
121
+ new_first_line = temp_buf.shift
122
+ if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
123
+ @buffer.last << new_first_line
124
+ @buffer << ""
125
+ else
126
+ @buffer.last << new_first_line
127
+ end
128
+ @buffer += temp_buf unless temp_buf.empty?
129
+ if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
130
+ @buffer << ""
131
+ end
132
+ break if @buffer.size > 1
133
+ end
134
+ return true
135
+ end
136
+
137
+ def get_newline
138
+ @buffer ||= [""]
139
+ if @buffer.size == 1
140
+ return nil unless fill_buffer
141
+ end
142
+ if @buffer.empty?
143
+ return nil
144
+ else
145
+ new_line = @buffer.shift
146
+ return new_line
147
+ end
148
+ end
149
+
150
+ def split_file
151
+ output_text = ""
152
+ end_flag = false
153
+ while text = get_newline
154
+ @count ||= 0;@count += 1;
155
+ @size_read ||=0
156
+ @size_read += text.bytesize
157
+ @total_size += text.bytesize
158
+ output_text << text
159
+ end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
160
+ # never close the file until the end of the page even if end_flag is on
161
+ if end_flag && /<\/page/ =~ text
162
+ @fp.puts(output_text)
163
+ output_text = ""
164
+ @total_size = 0
165
+ end_flag = false
166
+ @fp.close
167
+ @file_index += 1
168
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
169
+ @outfiles << outfilename
170
+ @fp = File.open(outfilename, "w")
171
+ next
172
+ end
173
+ end
174
+ @fp.puts(output_text) if output_text != ""
175
+ @fp.close
176
+
177
+ if File.size(outfilename) == 0
178
+ File.delete(outfilename)
179
+ @outfiles.delete(outfilename)
180
+ end
181
+
182
+ rename(@outfiles, "xml")
183
+ end
184
+ end
185
+
186
+ class Runner
187
+ include Wp2txt
188
+
189
+ def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)
190
+ @fp = nil
191
+ @input_file = input_file
192
+ @output_dir = output_dir
193
+ @strip_tmarker = strip_tmarker
194
+ @del_interfile = del_interfile
195
+ prepare
196
+ end
197
+
198
+ def prepare
199
+ @infile_size = File.stat(@input_file).size
200
+ file = open(@input_file)
201
+ @file_pointer = file
202
+ @outfile_base = File.basename(@input_file, ".*")
203
+ @total_size = 0
204
+ return true
205
+ end
206
+
207
+ def fill_buffer
208
+ while true do
209
+ begin
210
+ new_lines = @file_pointer.read(10485760)
211
+ rescue => e
212
+ return nil
213
+ end
214
+ return nil unless new_lines
215
+
216
+ # temp_buf is filled with text split by "\n"
217
+ temp_buf = []
218
+ ss = StringScanner.new(new_lines)
219
+ while ss.scan(/.*?\n/m)
160
220
  temp_buf << ss[0]
161
221
  end
162
222
  temp_buf << ss.rest unless ss.eos?
@@ -178,25 +238,22 @@ module Wp2txt
178
238
  end
179
239
 
180
240
  def get_newline
181
- @buffer ||= [""]
241
+ @buffer ||= [""]
182
242
  if @buffer.size == 1
183
243
  return nil unless fill_buffer
184
244
  end
185
245
  if @buffer.empty?
186
246
  return nil
187
- else
247
+ else
188
248
  new_line = @buffer.shift
189
249
  return new_line
190
- end
250
+ end
191
251
  end
192
252
 
193
253
  def get_page
194
254
  inside_page = false
195
255
  page = ""
196
256
  while line = get_newline
197
- notify_parent
198
- @size_read ||=0; @size_read += line.bytesize
199
-
200
257
  if /<page>/ =~ line #
201
258
  page << line
202
259
  inside_page = true
@@ -215,22 +272,7 @@ module Wp2txt
215
272
  end
216
273
  end
217
274
 
218
- # call this method to do the job
219
275
  def extract_text(&block)
220
- prepare
221
- if @convert
222
- if block
223
- extract_and_convert(&block)
224
- else
225
- extract_and_convert
226
- end
227
- else
228
- # output the original xml only split to files of the specified size
229
- extract
230
- end
231
- end
232
-
233
- def extract_and_convert(&block)
234
276
  in_text = false
235
277
  in_message = false
236
278
  result_text = ""
@@ -241,17 +283,15 @@ module Wp2txt
241
283
  pages = []
242
284
  data_empty = false
243
285
 
244
- begin
286
+ while !data_empty
245
287
  page = get_page
246
288
  if page
247
289
  pages << page
248
290
  else
249
291
  data_empty = true
250
292
  end
251
- if data_empty || pages.size == @num_threads
252
- # pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
253
- pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
254
- page_text = {:order => n, :data => nil}
293
+ if data_empty
294
+ pages.each do |page|
255
295
  xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
256
296
  xml = xmlns + page + "</mediawiki>"
257
297
 
@@ -270,79 +310,22 @@ module Wp2txt
270
310
  end
271
311
  end
272
312
  article = Article.new(text, title, @strip_tmarker)
273
- page_text[:data] = block.call(article)
313
+ page_text = block.call(article)
314
+ output_text << page_text
274
315
  end
275
- page_text
276
- end
277
- pages.clear
278
- pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
279
- pages_text.each do |page_text|
280
- output_text << page_text
281
- @count ||= 0; @count += 1;
282
- @total_size = output_text.bytesize
283
- # flagged when data exceeds the size of output file
284
- end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
285
316
  end
286
317
 
287
- #close the present file, then open a new one
288
- if end_flag
289
- cleanup!(output_text)
318
+ cleanup!(output_text)
319
+ if output_text.size > 0
320
+ outfilename = File.join(@output_dir, @outfile_base + ".txt")
321
+ @fp = File.open(outfilename, "w")
290
322
  @fp.puts(output_text)
291
- output_text = ""
292
- @total_size = 0
293
- end_flag = false
294
323
  @fp.close
295
- @file_index += 1
296
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
297
- @outfiles << outfilename
298
- @fp = File.open(outfilename, "w")
299
- next
300
324
  end
301
- end
302
- end while !data_empty
303
-
304
- if output_text != ""
305
- cleanup!(output_text)
306
- @fp.puts(output_text)
307
- end
308
- notify_parent(true)
309
- @parent.after
310
- @fp.close
311
- rename(@outfiles)
312
- @parent.msg("Processing finished", 1)
313
- end
314
-
315
- def extract
316
- output_text = ""
317
- end_flag = false
318
- while text = get_newline
319
- @count ||= 0;@count += 1;
320
- @size_read ||=0;@size_read += text.bytesize
321
- @total_size += text.bytesize
322
- output_text << text
323
- end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
324
- notify_parent
325
- # never close the file until the end of the page even if end_flag is on
326
- if end_flag && /<\/page/ =~ text
327
- @fp.puts(output_text)
325
+ File.delete(@input_file) if @del_interfile
328
326
  output_text = ""
329
- @total_size = 0
330
- end_flag = false
331
- @fp.close
332
- @file_index += 1
333
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
334
- @outfiles << outfilename
335
- @fp = File.open(outfilename, "w")
336
- next
337
327
  end
338
328
  end
339
- @fp.puts(output_text) if output_text != ""
340
- notify_parent(true)
341
- @parent.after
342
- @fp.close
343
- rename(@outfiles)
344
- @parent.msg("Processing finished", 1)
345
- end
329
+ end
346
330
  end
347
331
  end
348
-
data/spec/utils_spec.rb CHANGED
@@ -184,10 +184,6 @@ describe "Wp2txt" do
184
184
 
185
185
  describe "correct_inline_template!" do
186
186
  it "removes brackets and leaving some text" do
187
- # str_before = "{{}}"
188
- # str_after = ""
189
- # correct_inline_template!(str_before)
190
- # expect(str_before).to eq str_after
191
187
  str_before = "{{MedalCountry | {{JPN}} }}"
192
188
  str_after = "JPN"
193
189
  correct_inline_template!(str_before)
@@ -197,11 +193,11 @@ describe "Wp2txt" do
197
193
  correct_inline_template!(str_before)
198
194
  expect(str_before).to eq str_after
199
195
  str_before = "{{a|b=c|d=f}}"
200
- str_after = "a"
196
+ str_after = "c"
201
197
  correct_inline_template!(str_before)
202
198
  expect(str_before).to eq str_after
203
199
  str_before = "{{a|b|{{c|d|e}}}}"
204
- str_after = "e"
200
+ str_after = "b"
205
201
  correct_inline_template!(str_before)
206
202
  expect(str_before).to eq str_after
207
203
  str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
@@ -210,18 +206,4 @@ describe "Wp2txt" do
210
206
  expect(str_before).to eq str_after
211
207
  end
212
208
  end
213
-
214
- # describe "expand_template" do
215
- # it "gets data corresponding to a given template using mediawiki api" do
216
- # uri = "http://en.wiktionary.org/w/api.php"
217
- # template = "{{en-verb}}"
218
- # word = "kick"
219
- # expanded = expand_template(uri, template, word)
220
- # html =<<EOD
221
- # <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
222
- # EOD
223
- # html.strip!
224
- # expanded.should == html
225
- # end
226
- # end
227
- end
209
+ end
data/wp2txt.gemspec CHANGED
@@ -7,13 +7,14 @@ Gem::Specification.new do |s|
7
7
  s.version = Wp2txt::VERSION
8
8
  s.authors = ["Yoichiro Hasebe"]
9
9
  s.email = ["yohasebe@gmail.com"]
10
- s.homepage = "http://github.com/yohasebe/wp2txt"
11
- s.summary = %q{Wikipedia dump to text converter}
12
- s.description = %q{WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.}
10
+ s.homepage = "https://github.com/yohasebe/wp2txt"
11
+ s.summary = %q{A command-line toolkit to extract text content and category data from Wikipedia dump files}
12
+ s.description = %q{WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.}
13
13
 
14
14
  s.rubyforge_project = "wp2txt"
15
15
 
16
16
  s.files = `git ls-files`.split("\n")
17
+ s.files -= ["data/*", "image/*"]
17
18
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
19
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
20
  s.require_paths = ["lib"]
@@ -23,7 +24,10 @@ Gem::Specification.new do |s|
23
24
  # s.add_development_dependency "rake"
24
25
 
25
26
  s.add_dependency "nokogiri"
27
+ s.add_dependency "ruby-progressbar"
26
28
  s.add_dependency "parallel"
27
29
  s.add_dependency "htmlentities"
28
30
  s.add_dependency "optimist"
31
+ s.add_dependency "pastel"
32
+ s.add_dependency "tty-spinner"
29
33
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.5
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-02 00:00:00.000000000 Z
11
+ date: 2022-08-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: ruby-progressbar
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: parallel
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -66,12 +80,39 @@ dependencies:
66
80
  - - ">="
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
69
- description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
70
- XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
83
+ - !ruby/object:Gem::Dependency
84
+ name: pastel
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: tty-spinner
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: WP2TXT extracts text and category data from Wikipedia dump files (encoded
112
+ in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.
71
113
  email:
72
114
  - yohasebe@gmail.com
73
115
  executables:
74
- - benchmark.rb
75
116
  - wp2txt
76
117
  extensions: []
77
118
  extra_rdoc_files: []
@@ -81,26 +122,26 @@ files:
81
122
  - LICENSE
82
123
  - README.md
83
124
  - Rakefile
84
- - bin/benchmark.rb
85
125
  - bin/wp2txt
86
126
  - data/output_samples/testdata_en.txt
87
- - data/output_samples/testdata_en_categories.txt
127
+ - data/output_samples/testdata_en_category.txt
88
128
  - data/output_samples/testdata_en_summary.txt
89
129
  - data/output_samples/testdata_ja.txt
90
- - data/output_samples/testdata_ja_categories.txt
130
+ - data/output_samples/testdata_ja_category.txt
91
131
  - data/output_samples/testdata_ja_summary.txt
92
132
  - data/testdata_en.bz2
93
133
  - data/testdata_ja.bz2
134
+ - image/screenshot.png
135
+ - image/wp2txt-logo.svg
136
+ - image/wp2txt.svg
94
137
  - lib/wp2txt.rb
95
138
  - lib/wp2txt/article.rb
96
- - lib/wp2txt/mw_api.rb
97
- - lib/wp2txt/progressbar.rb
98
139
  - lib/wp2txt/utils.rb
99
140
  - lib/wp2txt/version.rb
100
141
  - spec/spec_helper.rb
101
142
  - spec/utils_spec.rb
102
143
  - wp2txt.gemspec
103
- homepage: http://github.com/yohasebe/wp2txt
144
+ homepage: https://github.com/yohasebe/wp2txt
104
145
  licenses: []
105
146
  metadata: {}
106
147
  post_install_message:
@@ -121,7 +162,8 @@ requirements: []
121
162
  rubygems_version: 3.3.7
122
163
  signing_key:
123
164
  specification_version: 4
124
- summary: Wikipedia dump to text converter
165
+ summary: A command-line toolkit to extract text content and category data from Wikipedia
166
+ dump files
125
167
  test_files:
126
168
  - spec/spec_helper.rb
127
169
  - spec/utils_spec.rb