wp2txt 1.1.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 62f1e8d6ab1932f3ae3c34fb71930b7e73500c832481dcea6288742c38850a79
4
- data.tar.gz: f0ff0a5488b635b828338d41029c5ad191a0c88282c0fa294a9facf2d93c055b
3
+ metadata.gz: 5e8cda1ff32863bb95b4b314782e46007eabcd784938b4fe33f6660445a37d31
4
+ data.tar.gz: 805bd29ba8b660e705156bf7a4cc1d006b2f43a2c81c24e86ef210ef1fd1ef16
5
5
  SHA512:
6
- metadata.gz: 7bca85758e88d53dcd33fe43e83a251624f89329a2cb55ffb97b41141bcf8fe5ace7c48e3b8e49f5aa42f84724247cfe4ad376238a949e9154876d4d07469afe
7
- data.tar.gz: de59399d5163afed2947e0802abf2e0365894d566c8a1f11823bc901d4948346e7af47d6fba558387f5af7e1301a6725a51a322ac1cd4810264dc3003e0729e2
6
+ metadata.gz: fe798d5ab55cefd55f776e4d0f975cb510a7c9c65af348ba216365827266d808b65f9125d9bc50c21cb05349ae71d07a46a80998ffd000bbcaac71b2eed15e45
7
+ data.tar.gz: 94f8df87a935b52d19f05adca27a01f64787dfa35fed067dc68cc1204b4b0022411f6cb6db1d2c9175987d27113e506f84d1761e4661c7d4f65f934c6ee1647e
data/README.md CHANGED
@@ -8,6 +8,15 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
8
8
 
9
9
  ## Changelog
10
10
 
11
+ **April 2023**
12
+
13
+ - File split/delete issues fixed
14
+
15
+ **January 2023**
16
+
17
+ - Bug related to command line arguments fixed
18
+ - Code cleanup introducing Rubocop
19
+
11
20
  **December 2022**
12
21
 
13
22
  - Docker images available via Docker Hub
@@ -93,7 +102,7 @@ Download the latest Wikipedia dump file for the desired language at a URL such a
93
102
 
94
103
  https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
95
104
 
96
- Here, `enwiki` refers to the English Wikipedia. To get the Japanese Wikipedia dump file, for instance, change this to jawiki (Japanese). In doing so, note that there are two instances of `enwiki` in the URL above.
105
+ Here, `enwiki` refers to the English Wikipedia. To get the Japanese Wikipedia dump file, for instance, change this to `jawiki` (Japanese). In doing so, note that there are two instances of `enwiki` in the URL above.
97
106
 
98
107
  Alternatively, you can also select Wikipedia dump files created on a specific date from [here](http://dumps.wikimedia.org/backup-index.html). Make sure to download a file named in the following format:
99
108
 
@@ -213,11 +222,11 @@ The author will appreciate your mentioning one of these in your research.
213
222
  Or use this BibTeX entry:
214
223
 
215
224
  ```
216
- @misc{wp2txt_2022,
225
+ @misc{wp2txt_2023,
217
226
  author = {Yoichiro Hasebe},
218
227
  title = {WP2TXT: A command-line toolkit to extract text content and category data from Wikipedia dump files},
219
228
  url = {https://github.com/yohasebe/wp2txt},
220
- year = {2022}
229
+ year = {2023}
221
230
  }
222
231
  ```
223
232
 
data/bin/wp2txt CHANGED
@@ -3,8 +3,6 @@
3
3
  # frozen_string_literal: true
4
4
 
5
5
  DEBUG_MODE = false
6
- SHAREDIR = File.join(File.dirname(__FILE__), "..", "share")
7
- DOCDIR = File.join(File.dirname(__FILE__), "..", "doc")
8
6
 
9
7
  require_relative "../lib/wp2txt"
10
8
  require_relative "../lib/wp2txt/utils"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wp2txt
4
- VERSION = "1.1.0"
4
+ VERSION = "1.1.2"
5
5
  end
data/lib/wp2txt.rb CHANGED
@@ -48,14 +48,19 @@ module Wp2txt
48
48
  basename = File.basename(command)
49
49
  path = +""
50
50
  print "Checking #{basename}: "
51
- if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
52
- puts "detected [#{path}]"
53
- path.strip
54
- elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
55
- puts "detected [#{path}]"
56
- path.strip
57
- else
58
- puts "not found"
51
+ begin
52
+ if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
53
+ puts "detected [#{path}]"
54
+ path.strip
55
+ elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
56
+ puts "detected [#{path}]"
57
+ path.strip
58
+ else
59
+ puts "#{basename} not found"
60
+ false
61
+ end
62
+ rescue StandardError
63
+ puts "#{basename} not found"
59
64
  false
60
65
  end
61
66
  end
@@ -69,7 +74,7 @@ module Wp2txt
69
74
  if /.bz2$/ =~ @input_file
70
75
  if @bz2_gem
71
76
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
72
- elsif RUBY_PLATFORM.index("win32")
77
+ elsif Gem.win_platform?
73
78
  file = IO.popen("bunzip2.exe -c #{@input_file}")
74
79
  elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
75
80
  file = IO.popen("#{bzpath} -c -d #{@input_file}")
@@ -155,7 +160,7 @@ module Wp2txt
155
160
  @fp.puts(output_text) if output_text != ""
156
161
  @fp.close
157
162
 
158
- if File.size(outfilename).zero?
163
+ if outfilename && File.size(outfilename).zero?
159
164
  File.delete(outfilename)
160
165
  @outfiles.delete(outfilename)
161
166
  end
@@ -292,6 +297,7 @@ module Wp2txt
292
297
  @fp.puts(output_text)
293
298
  @fp.close
294
299
  end
300
+ @file_pointer.close
295
301
  File.delete(@input_file) if @del_interfile
296
302
  output_text = +""
297
303
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-22 00:00:00.000000000 Z
11
+ date: 2023-04-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
208
208
  - !ruby/object:Gem::Version
209
209
  version: '0'
210
210
  requirements: []
211
- rubygems_version: 3.4.1
211
+ rubygems_version: 3.3.3
212
212
  signing_key:
213
213
  specification_version: 4
214
214
  summary: A command-line toolkit to extract text content and category data from Wikipedia