wp2txt 1.1.2 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -1
- data/bin/wp2txt +5 -3
- data/lib/wp2txt/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62b3ee240d3ee685e3739eb4a8d6f9923677f75a97490d73286d64730b60ae5b
|
4
|
+
data.tar.gz: 1a2b9c3d23266b45d96e0f0984ebbcf9e610aa91adbdb3cc4891a23767cd0315
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 14eb3cb035ac0815e30bcab8d4eaffd6ed84f8b7449f6d9ee2492d9656bbd847a3da8aad00fcb331a8c55a9379ce90ff33a5018f6ab35f1a1046c87c4b57ab17
|
7
|
+
data.tar.gz: 68a4e8bf5952a433be23528078a3fa526a3bcceceb1c3f8a7f79412256c78b9f5732ece782e1eb311711dec86eb9490d5f1596eba12ba15a627ebdae4999c551
|
data/README.md
CHANGED
@@ -8,6 +8,10 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
|
|
8
8
|
|
9
9
|
## Changelog
|
10
10
|
|
11
|
+
**May 2023**
|
12
|
+
|
13
|
+
- Problems caused by too many parallel processors are addressed by setting the upper limit on the number of processors to 8.
|
14
|
+
|
11
15
|
**April 2023**
|
12
16
|
|
13
17
|
- File split/delete issues fixed
|
@@ -186,7 +190,7 @@ Command line options are as follows:
|
|
186
190
|
-g, --category-only Extract only article title and categories
|
187
191
|
-s, --summary-only Extract only article title, categories, and summary text before first heading
|
188
192
|
-f, --file-size=<i> Approximate size (in MB) of each output file (default: 10)
|
189
|
-
-n, --num-procs Number of processes to be run concurrently (default: max num of available CPU cores minus two)
|
193
|
+
-n, --num-procs Number of processes (up to 8) to be run concurrently (default: max num of available CPU cores minus two)
|
190
194
|
-x, --del-interfile Delete intermediate XML files from output dir
|
191
195
|
-t, --title, --no-title Keep page titles in output (default: true)
|
192
196
|
-d, --heading, --no-heading Keep section titles in output (default: true)
|
data/bin/wp2txt
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
DEBUG_MODE = false
|
6
|
+
MAX_PROCESSORS = 8
|
6
7
|
|
7
8
|
require_relative "../lib/wp2txt"
|
8
9
|
require_relative "../lib/wp2txt/utils"
|
@@ -34,7 +35,7 @@ class WpApp
|
|
34
35
|
opt :category_only, "Extract only article title and categories", default: false, short: "-g"
|
35
36
|
opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
|
36
37
|
opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
|
37
|
-
opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
|
38
|
+
opt :num_procs, "Number of proccesses (up to #{MAX_PROCESSORS}) to be run concurrently (default: max num of CPU cores minus two)", type: Integer, short: "-n"
|
38
39
|
opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
|
39
40
|
opt :title, "Keep page titles in output", default: true, short: "-t"
|
40
41
|
opt :heading, "Keep section titles in output", default: true, short: "-d"
|
@@ -55,10 +56,11 @@ class WpApp
|
|
55
56
|
output_dir = opts[:output_dir]
|
56
57
|
tfile_size = opts[:file_size]
|
57
58
|
num_processors = Etc.nprocessors
|
58
|
-
num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
|
59
|
+
num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors && opts[:num_procs].to_i <= MAX_PROCESSORS
|
59
60
|
opts[:num_procs]
|
60
61
|
else
|
61
|
-
num_processors - 2
|
62
|
+
minus2 = num_processors - 2
|
63
|
+
minus2 < MAX_PROCESSORS ? minus2 : MAX_PROCESSORS
|
62
64
|
end
|
63
65
|
num_processes = 1 if num_processes < 1
|
64
66
|
|
data/lib/wp2txt/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.2
|
4
|
+
version: 1.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-05-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
208
208
|
- !ruby/object:Gem::Version
|
209
209
|
version: '0'
|
210
210
|
requirements: []
|
211
|
-
rubygems_version: 3.
|
211
|
+
rubygems_version: 3.4.12
|
212
212
|
signing_key:
|
213
213
|
specification_version: 4
|
214
214
|
summary: A command-line toolkit to extract text content and category data from Wikipedia
|