wp2txt 1.1.0 → 1.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 62f1e8d6ab1932f3ae3c34fb71930b7e73500c832481dcea6288742c38850a79
4
- data.tar.gz: f0ff0a5488b635b828338d41029c5ad191a0c88282c0fa294a9facf2d93c055b
3
+ metadata.gz: 5e8cda1ff32863bb95b4b314782e46007eabcd784938b4fe33f6660445a37d31
4
+ data.tar.gz: 805bd29ba8b660e705156bf7a4cc1d006b2f43a2c81c24e86ef210ef1fd1ef16
5
5
  SHA512:
6
- metadata.gz: 7bca85758e88d53dcd33fe43e83a251624f89329a2cb55ffb97b41141bcf8fe5ace7c48e3b8e49f5aa42f84724247cfe4ad376238a949e9154876d4d07469afe
7
- data.tar.gz: de59399d5163afed2947e0802abf2e0365894d566c8a1f11823bc901d4948346e7af47d6fba558387f5af7e1301a6725a51a322ac1cd4810264dc3003e0729e2
6
+ metadata.gz: fe798d5ab55cefd55f776e4d0f975cb510a7c9c65af348ba216365827266d808b65f9125d9bc50c21cb05349ae71d07a46a80998ffd000bbcaac71b2eed15e45
7
+ data.tar.gz: 94f8df87a935b52d19f05adca27a01f64787dfa35fed067dc68cc1204b4b0022411f6cb6db1d2c9175987d27113e506f84d1761e4661c7d4f65f934c6ee1647e
data/README.md CHANGED
@@ -8,6 +8,15 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
8
8
 
9
9
  ## Changelog
10
10
 
11
+ **April 2023**
12
+
13
+ - File split/delete issues fixed
14
+
15
+ **January 2023**
16
+
17
+ - Bug related to command line arguments fixed
18
+ - Code cleanup introducing Rubocop
19
+
11
20
  **December 2022**
12
21
 
13
22
  - Docker images available via Docker Hub
@@ -93,7 +102,7 @@ Download the latest Wikipedia dump file for the desired language at a URL such a
93
102
 
94
103
  https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
95
104
 
96
- Here, `enwiki` refers to the English Wikipedia. To get the Japanese Wikipedia dump file, for instance, change this to jawiki (Japanese). In doing so, note that there are two instances of `enwiki` in the URL above.
105
+ Here, `enwiki` refers to the English Wikipedia. To get the Japanese Wikipedia dump file, for instance, change this to `jawiki` (Japanese). In doing so, note that there are two instances of `enwiki` in the URL above.
97
106
 
98
107
  Alternatively, you can also select Wikipedia dump files created on a specific date from [here](http://dumps.wikimedia.org/backup-index.html). Make sure to download a file named in the following format:
99
108
 
@@ -213,11 +222,11 @@ The author will appreciate your mentioning one of these in your research.
213
222
  Or use this BibTeX entry:
214
223
 
215
224
  ```
216
- @misc{wp2txt_2022,
225
+ @misc{wp2txt_2023,
217
226
  author = {Yoichiro Hasebe},
218
227
  title = {WP2TXT: A command-line toolkit to extract text content and category data from Wikipedia dump files},
219
228
  url = {https://github.com/yohasebe/wp2txt},
220
- year = {2022}
229
+ year = {2023}
221
230
  }
222
231
  ```
223
232
 
data/bin/wp2txt CHANGED
@@ -3,8 +3,6 @@
3
3
  # frozen_string_literal: true
4
4
 
5
5
  DEBUG_MODE = false
6
- SHAREDIR = File.join(File.dirname(__FILE__), "..", "share")
7
- DOCDIR = File.join(File.dirname(__FILE__), "..", "doc")
8
6
 
9
7
  require_relative "../lib/wp2txt"
10
8
  require_relative "../lib/wp2txt/utils"
data/lib/wp2txt/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wp2txt
4
- VERSION = "1.1.0"
4
+ VERSION = "1.1.2"
5
5
  end
data/lib/wp2txt.rb CHANGED
@@ -48,14 +48,19 @@ module Wp2txt
48
48
  basename = File.basename(command)
49
49
  path = +""
50
50
  print "Checking #{basename}: "
51
- if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
52
- puts "detected [#{path}]"
53
- path.strip
54
- elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
55
- puts "detected [#{path}]"
56
- path.strip
57
- else
58
- puts "not found"
51
+ begin
52
+ if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
53
+ puts "detected [#{path}]"
54
+ path.strip
55
+ elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
56
+ puts "detected [#{path}]"
57
+ path.strip
58
+ else
59
+ puts "#{basename} not found"
60
+ false
61
+ end
62
+ rescue StandardError
63
+ puts "#{basename} not found"
59
64
  false
60
65
  end
61
66
  end
@@ -69,7 +74,7 @@ module Wp2txt
69
74
  if /.bz2$/ =~ @input_file
70
75
  if @bz2_gem
71
76
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
72
- elsif RUBY_PLATFORM.index("win32")
77
+ elsif Gem.win_platform?
73
78
  file = IO.popen("bunzip2.exe -c #{@input_file}")
74
79
  elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
75
80
  file = IO.popen("#{bzpath} -c -d #{@input_file}")
@@ -155,7 +160,7 @@ module Wp2txt
155
160
  @fp.puts(output_text) if output_text != ""
156
161
  @fp.close
157
162
 
158
- if File.size(outfilename).zero?
163
+ if outfilename && File.size(outfilename).zero?
159
164
  File.delete(outfilename)
160
165
  @outfiles.delete(outfilename)
161
166
  end
@@ -292,6 +297,7 @@ module Wp2txt
292
297
  @fp.puts(output_text)
293
298
  @fp.close
294
299
  end
300
+ @file_pointer.close
295
301
  File.delete(@input_file) if @del_interfile
296
302
  output_text = +""
297
303
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-22 00:00:00.000000000 Z
11
+ date: 2023-04-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
208
208
  - !ruby/object:Gem::Version
209
209
  version: '0'
210
210
  requirements: []
211
- rubygems_version: 3.4.1
211
+ rubygems_version: 3.3.3
212
212
  signing_key:
213
213
  specification_version: 4
214
214
  summary: A command-line toolkit to extract text content and category data from Wikipedia