wp2txt 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -3
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +14 -9
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0bcba2c84286504ae628176aad55dbdea05889dfaa7f471cf080ae933691cffc
|
4
|
+
data.tar.gz: f83c63d7c6e91270da1da2aed54ab6e5c352c5695340ccd6378fdd20c43fc332
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ae0eae028a98d4299a0e93278220b991e53e13deb80b88cce2971cd889d769808305e0d3aa8ee4e73af0cc55f07f27c0cc6a9f0d440e4693a410ba7d0a6333ba
|
7
|
+
data.tar.gz: 25bff247bf80b4a0b5ff785ed51b60e3e21e6a3ca5e0bfeace9961df5060832ae047c3fa606a3267f9f770c253fd816176372c4d2f7ba73999d5317ea59933e6
|
data/README.md
CHANGED
@@ -8,6 +8,11 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
|
|
8
8
|
|
9
9
|
## Changelog
|
10
10
|
|
11
|
+
**January 2023**
|
12
|
+
|
13
|
+
- Bug related to command line arguments fixed
|
14
|
+
- Code cleanup introducing Rubocop
|
15
|
+
|
11
16
|
**December 2022**
|
12
17
|
|
13
18
|
- Docker images available via Docker Hub
|
@@ -93,7 +98,7 @@ Download the latest Wikipedia dump file for the desired language at a URL such a
|
|
93
98
|
|
94
99
|
https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
|
95
100
|
|
96
|
-
Here, `enwiki` refers to the English Wikipedia. To get the Japanese Wikipedia dump file, for instance, change this to jawiki (Japanese). In doing so, note that there are two instances of `enwiki` in the URL above.
|
101
|
+
Here, `enwiki` refers to the English Wikipedia. To get the Japanese Wikipedia dump file, for instance, change this to `jawiki` (Japanese). In doing so, note that there are two instances of `enwiki` in the URL above.
|
97
102
|
|
98
103
|
Alternatively, you can also select Wikipedia dump files created on a specific date from [here](http://dumps.wikimedia.org/backup-index.html). Make sure to download a file named in the following format:
|
99
104
|
|
@@ -213,11 +218,11 @@ The author will appreciate your mentioning one of these in your research.
|
|
213
218
|
Or use this BibTeX entry:
|
214
219
|
|
215
220
|
```
|
216
|
-
@misc{
|
221
|
+
@misc{wp2txt_2023,
|
217
222
|
author = {Yoichiro Hasebe},
|
218
223
|
title = {WP2TXT: A command-line toolkit to extract text content and category data from Wikipedia dump files},
|
219
224
|
url = {https://github.com/yohasebe/wp2txt},
|
220
|
-
year = {
|
225
|
+
year = {2023}
|
221
226
|
}
|
222
227
|
```
|
223
228
|
|
data/lib/wp2txt/version.rb
CHANGED
data/lib/wp2txt.rb
CHANGED
@@ -48,14 +48,19 @@ module Wp2txt
|
|
48
48
|
basename = File.basename(command)
|
49
49
|
path = +""
|
50
50
|
print "Checking #{basename}: "
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
51
|
+
begin
|
52
|
+
if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
|
53
|
+
puts "detected [#{path}]"
|
54
|
+
path.strip
|
55
|
+
elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
|
56
|
+
puts "detected [#{path}]"
|
57
|
+
path.strip
|
58
|
+
else
|
59
|
+
puts "#{basename} not found"
|
60
|
+
false
|
61
|
+
end
|
62
|
+
rescue StandardError
|
63
|
+
puts "#{basename} not found"
|
59
64
|
false
|
60
65
|
end
|
61
66
|
end
|
@@ -69,7 +74,7 @@ module Wp2txt
|
|
69
74
|
if /.bz2$/ =~ @input_file
|
70
75
|
if @bz2_gem
|
71
76
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
72
|
-
elsif
|
77
|
+
elsif Gem.win_platform?
|
73
78
|
file = IO.popen("bunzip2.exe -c #{@input_file}")
|
74
79
|
elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
|
75
80
|
file = IO.popen("#{bzpath} -c -d #{@input_file}")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-01-
|
11
|
+
date: 2023-01-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
208
208
|
- !ruby/object:Gem::Version
|
209
209
|
version: '0'
|
210
210
|
requirements: []
|
211
|
-
rubygems_version: 3.4.
|
211
|
+
rubygems_version: 3.4.2
|
212
212
|
signing_key:
|
213
213
|
specification_version: 4
|
214
214
|
summary: A command-line toolkit to extract text content and category data from Wikipedia
|