wp2txt 1.1.1 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -1
- data/bin/wp2txt +5 -5
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +2 -1
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 62b3ee240d3ee685e3739eb4a8d6f9923677f75a97490d73286d64730b60ae5b
|
|
4
|
+
data.tar.gz: 1a2b9c3d23266b45d96e0f0984ebbcf9e610aa91adbdb3cc4891a23767cd0315
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 14eb3cb035ac0815e30bcab8d4eaffd6ed84f8b7449f6d9ee2492d9656bbd847a3da8aad00fcb331a8c55a9379ce90ff33a5018f6ab35f1a1046c87c4b57ab17
|
|
7
|
+
data.tar.gz: 68a4e8bf5952a433be23528078a3fa526a3bcceceb1c3f8a7f79412256c78b9f5732ece782e1eb311711dec86eb9490d5f1596eba12ba15a627ebdae4999c551
|
data/README.md
CHANGED
|
@@ -8,6 +8,14 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
|
|
|
8
8
|
|
|
9
9
|
## Changelog
|
|
10
10
|
|
|
11
|
+
**May 2023**
|
|
12
|
+
|
|
13
|
+
- Problems caused by too many parallel processors are addressed by setting the upper limit on the number of processors to 8.
|
|
14
|
+
|
|
15
|
+
**April 2023**
|
|
16
|
+
|
|
17
|
+
- File split/delete issues fixed
|
|
18
|
+
|
|
11
19
|
**January 2023**
|
|
12
20
|
|
|
13
21
|
- Bug related to command line arguments fixed
|
|
@@ -182,7 +190,7 @@ Command line options are as follows:
|
|
|
182
190
|
-g, --category-only Extract only article title and categories
|
|
183
191
|
-s, --summary-only Extract only article title, categories, and summary text before first heading
|
|
184
192
|
-f, --file-size=<i> Approximate size (in MB) of each output file (default: 10)
|
|
185
|
-
-n, --num-procs Number of proccesses to be run concurrently (default: max num of available CPU cores minus two)
|
|
193
|
+
-n, --num-procs Number of proccesses (up to 8) to be run concurrently (default: max num of available CPU cores minus two)
|
|
186
194
|
-x, --del-interfile Delete intermediate XML files from output dir
|
|
187
195
|
-t, --title, --no-title Keep page titles in output (default: true)
|
|
188
196
|
-d, --heading, --no-heading Keep section titles in output (default: true)
|
data/bin/wp2txt
CHANGED
|
@@ -3,8 +3,7 @@
|
|
|
3
3
|
# frozen_string_literal: true
|
|
4
4
|
|
|
5
5
|
DEBUG_MODE = false
|
|
6
|
-
|
|
7
|
-
DOCDIR = File.join(File.dirname(__FILE__), "..", "doc")
|
|
6
|
+
MAX_PROCESSORS = 8
|
|
8
7
|
|
|
9
8
|
require_relative "../lib/wp2txt"
|
|
10
9
|
require_relative "../lib/wp2txt/utils"
|
|
@@ -36,7 +35,7 @@ class WpApp
|
|
|
36
35
|
opt :category_only, "Extract only article title and categories", default: false, short: "-g"
|
|
37
36
|
opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
|
|
38
37
|
opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
|
|
39
|
-
opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
|
|
38
|
+
opt :num_procs, "Number of proccesses (up to #{MAX_PROCESSORS}) to be run concurrently (default: max num of CPU cores minus two)", type: Integer, short: "-n"
|
|
40
39
|
opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
|
|
41
40
|
opt :title, "Keep page titles in output", default: true, short: "-t"
|
|
42
41
|
opt :heading, "Keep section titles in output", default: true, short: "-d"
|
|
@@ -57,10 +56,11 @@ class WpApp
|
|
|
57
56
|
output_dir = opts[:output_dir]
|
|
58
57
|
tfile_size = opts[:file_size]
|
|
59
58
|
num_processors = Etc.nprocessors
|
|
60
|
-
num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
|
|
59
|
+
num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors && opts[:num_procs].to_i <= MAX_PROCESSORS
|
|
61
60
|
opts[:num_procs]
|
|
62
61
|
else
|
|
63
|
-
num_processors - 2
|
|
62
|
+
minus2 = num_processors - 2
|
|
63
|
+
minus2 < MAX_PROCESSORS ? minus2 : MAX_PROCESSORS
|
|
64
64
|
end
|
|
65
65
|
num_processes = 1 if num_processes < 1
|
|
66
66
|
|
data/lib/wp2txt/version.rb
CHANGED
data/lib/wp2txt.rb
CHANGED
|
@@ -160,7 +160,7 @@ module Wp2txt
|
|
|
160
160
|
@fp.puts(output_text) if output_text != ""
|
|
161
161
|
@fp.close
|
|
162
162
|
|
|
163
|
-
if File.size(outfilename).zero?
|
|
163
|
+
if outfilename && File.size(outfilename).zero?
|
|
164
164
|
File.delete(outfilename)
|
|
165
165
|
@outfiles.delete(outfilename)
|
|
166
166
|
end
|
|
@@ -297,6 +297,7 @@ module Wp2txt
|
|
|
297
297
|
@fp.puts(output_text)
|
|
298
298
|
@fp.close
|
|
299
299
|
end
|
|
300
|
+
@file_pointer.close
|
|
300
301
|
File.delete(@input_file) if @del_interfile
|
|
301
302
|
output_text = +""
|
|
302
303
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: wp2txt
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.1.1
|
|
4
|
+
version: 1.1.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yoichiro Hasebe
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2023-
|
|
11
|
+
date: 2023-05-13 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
208
208
|
- !ruby/object:Gem::Version
|
|
209
209
|
version: '0'
|
|
210
210
|
requirements: []
|
|
211
|
-
rubygems_version: 3.4.
|
|
211
|
+
rubygems_version: 3.4.12
|
|
212
212
|
signing_key:
|
|
213
213
|
specification_version: 4
|
|
214
214
|
summary: A command-line toolkit to extract text content and category data from Wikipedia
|