wp2txt 1.1.0 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +12 -3
- data/bin/wp2txt +0 -2
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +16 -10
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5e8cda1ff32863bb95b4b314782e46007eabcd784938b4fe33f6660445a37d31
|
|
4
|
+
data.tar.gz: 805bd29ba8b660e705156bf7a4cc1d006b2f43a2c81c24e86ef210ef1fd1ef16
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fe798d5ab55cefd55f776e4d0f975cb510a7c9c65af348ba216365827266d808b65f9125d9bc50c21cb05349ae71d07a46a80998ffd000bbcaac71b2eed15e45
|
|
7
|
+
data.tar.gz: 94f8df87a935b52d19f05adca27a01f64787dfa35fed067dc68cc1204b4b0022411f6cb6db1d2c9175987d27113e506f84d1761e4661c7d4f65f934c6ee1647e
|
data/README.md
CHANGED
|
@@ -8,6 +8,15 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
|
|
|
8
8
|
|
|
9
9
|
## Changelog
|
|
10
10
|
|
|
11
|
+
**April 2023**
|
|
12
|
+
|
|
13
|
+
- File split/delete issues fixed
|
|
14
|
+
|
|
15
|
+
**January 2023**
|
|
16
|
+
|
|
17
|
+
- Bug related to command line arguments fixed
|
|
18
|
+
- Code cleanup introducing Rubocop
|
|
19
|
+
|
|
11
20
|
**December 2022**
|
|
12
21
|
|
|
13
22
|
- Docker images available via Docker Hub
|
|
@@ -93,7 +102,7 @@ Download the latest Wikipedia dump file for the desired language at a URL such a
|
|
|
93
102
|
|
|
94
103
|
https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
|
|
95
104
|
|
|
96
|
-
Here, `enwiki` refers to the English Wikipedia. To get the Japanese Wikipedia dump file, for instance, change this to jawiki (Japanese). In doing so, note that there are two instances of `enwiki` in the URL above.
|
|
105
|
+
Here, `enwiki` refers to the English Wikipedia. To get the Japanese Wikipedia dump file, for instance, change this to `jawiki` (Japanese). In doing so, note that there are two instances of `enwiki` in the URL above.
|
|
97
106
|
|
|
98
107
|
Alternatively, you can also select Wikipedia dump files created on a specific date from [here](http://dumps.wikimedia.org/backup-index.html). Make sure to download a file named in the following format:
|
|
99
108
|
|
|
@@ -213,11 +222,11 @@ The author will appreciate your mentioning one of these in your research.
|
|
|
213
222
|
Or use this BibTeX entry:
|
|
214
223
|
|
|
215
224
|
```
|
|
216
|
-
@misc{
|
|
225
|
+
@misc{wp2txt_2023,
|
|
217
226
|
author = {Yoichiro Hasebe},
|
|
218
227
|
title = {WP2TXT: A command-line toolkit to extract text content and category data from Wikipedia dump files},
|
|
219
228
|
url = {https://github.com/yohasebe/wp2txt},
|
|
220
|
-
year = {
|
|
229
|
+
year = {2023}
|
|
221
230
|
}
|
|
222
231
|
```
|
|
223
232
|
|
data/bin/wp2txt
CHANGED
data/lib/wp2txt/version.rb
CHANGED
data/lib/wp2txt.rb
CHANGED
|
@@ -48,14 +48,19 @@ module Wp2txt
|
|
|
48
48
|
basename = File.basename(command)
|
|
49
49
|
path = +""
|
|
50
50
|
print "Checking #{basename}: "
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
51
|
+
begin
|
|
52
|
+
if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
|
|
53
|
+
puts "detected [#{path}]"
|
|
54
|
+
path.strip
|
|
55
|
+
elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
|
|
56
|
+
puts "detected [#{path}]"
|
|
57
|
+
path.strip
|
|
58
|
+
else
|
|
59
|
+
puts "#{basename} not found"
|
|
60
|
+
false
|
|
61
|
+
end
|
|
62
|
+
rescue StandardError
|
|
63
|
+
puts "#{basename} not found"
|
|
59
64
|
false
|
|
60
65
|
end
|
|
61
66
|
end
|
|
@@ -69,7 +74,7 @@ module Wp2txt
|
|
|
69
74
|
if /.bz2$/ =~ @input_file
|
|
70
75
|
if @bz2_gem
|
|
71
76
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
|
72
|
-
elsif
|
|
77
|
+
elsif Gem.win_platform?
|
|
73
78
|
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
|
74
79
|
elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
|
|
75
80
|
file = IO.popen("#{bzpath} -c -d #{@input_file}")
|
|
@@ -155,7 +160,7 @@ module Wp2txt
|
|
|
155
160
|
@fp.puts(output_text) if output_text != ""
|
|
156
161
|
@fp.close
|
|
157
162
|
|
|
158
|
-
if File.size(outfilename).zero?
|
|
163
|
+
if outfilename && File.size(outfilename).zero?
|
|
159
164
|
File.delete(outfilename)
|
|
160
165
|
@outfiles.delete(outfilename)
|
|
161
166
|
end
|
|
@@ -292,6 +297,7 @@ module Wp2txt
|
|
|
292
297
|
@fp.puts(output_text)
|
|
293
298
|
@fp.close
|
|
294
299
|
end
|
|
300
|
+
@file_pointer.close
|
|
295
301
|
File.delete(@input_file) if @del_interfile
|
|
296
302
|
output_text = +""
|
|
297
303
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: wp2txt
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.1.0
|
|
4
|
+
version: 1.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yoichiro Hasebe
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2023-
|
|
11
|
+
date: 2023-04-15 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
208
208
|
- !ruby/object:Gem::Version
|
|
209
209
|
version: '0'
|
|
210
210
|
requirements: []
|
|
211
|
-
rubygems_version: 3.
|
|
211
|
+
rubygems_version: 3.3.3
|
|
212
212
|
signing_key:
|
|
213
213
|
specification_version: 4
|
|
214
214
|
summary: A command-line toolkit to extract text content and category data from Wikipedia
|