wp2txt 1.1.2 → 1.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +5 -1
- data/bin/wp2txt +5 -3
- data/lib/wp2txt/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62b3ee240d3ee685e3739eb4a8d6f9923677f75a97490d73286d64730b60ae5b
|
4
|
+
data.tar.gz: 1a2b9c3d23266b45d96e0f0984ebbcf9e610aa91adbdb3cc4891a23767cd0315
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 14eb3cb035ac0815e30bcab8d4eaffd6ed84f8b7449f6d9ee2492d9656bbd847a3da8aad00fcb331a8c55a9379ce90ff33a5018f6ab35f1a1046c87c4b57ab17
|
7
|
+
data.tar.gz: 68a4e8bf5952a433be23528078a3fa526a3bcceceb1c3f8a7f79412256c78b9f5732ece782e1eb311711dec86eb9490d5f1596eba12ba15a627ebdae4999c551
|
data/README.md
CHANGED
@@ -8,6 +8,10 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
|
|
8
8
|
|
9
9
|
## Changelog
|
10
10
|
|
11
|
+
**May 2023**
|
12
|
+
|
13
|
+
- Problems caused by too many parallel processors are addressed by setting the upper limit on the number of processors to 8.
|
14
|
+
|
11
15
|
**April 2023**
|
12
16
|
|
13
17
|
- File split/delete issues fixed
|
@@ -186,7 +190,7 @@ Command line options are as follows:
|
|
186
190
|
-g, --category-only Extract only article title and categories
|
187
191
|
-s, --summary-only Extract only article title, categories, and summary text before first heading
|
188
192
|
-f, --file-size=<i> Approximate size (in MB) of each output file (default: 10)
|
189
|
-
-n, --num-procs Number of proccesses to be run concurrently (default: max num of available CPU cores minus two)
|
193
|
+
-n, --num-procs Number of proccesses (up to 8) to be run concurrently (default: max num of available CPU cores minus two)
|
190
194
|
-x, --del-interfile Delete intermediate XML files from output dir
|
191
195
|
-t, --title, --no-title Keep page titles in output (default: true)
|
192
196
|
-d, --heading, --no-heading Keep section titles in output (default: true)
|
data/bin/wp2txt
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
DEBUG_MODE = false
|
6
|
+
MAX_PROCESSORS = 8
|
6
7
|
|
7
8
|
require_relative "../lib/wp2txt"
|
8
9
|
require_relative "../lib/wp2txt/utils"
|
@@ -34,7 +35,7 @@ class WpApp
|
|
34
35
|
opt :category_only, "Extract only article title and categories", default: false, short: "-g"
|
35
36
|
opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
|
36
37
|
opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
|
37
|
-
opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
|
38
|
+
opt :num_procs, "Number of proccesses (up to #{MAX_PROCESSORS}) to be run concurrently (default: max num of CPU cores minus two)", type: Integer, short: "-n"
|
38
39
|
opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
|
39
40
|
opt :title, "Keep page titles in output", default: true, short: "-t"
|
40
41
|
opt :heading, "Keep section titles in output", default: true, short: "-d"
|
@@ -55,10 +56,11 @@ class WpApp
|
|
55
56
|
output_dir = opts[:output_dir]
|
56
57
|
tfile_size = opts[:file_size]
|
57
58
|
num_processors = Etc.nprocessors
|
58
|
-
num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
|
59
|
+
num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors && opts[:num_procs].to_i <= MAX_PROCESSORS
|
59
60
|
opts[:num_procs]
|
60
61
|
else
|
61
|
-
num_processors - 2
|
62
|
+
minus2 = num_processors - 2
|
63
|
+
minus2 < MAX_PROCESSORS ? minus2 : MAX_PROCESSORS
|
62
64
|
end
|
63
65
|
num_processes = 1 if num_processes < 1
|
64
66
|
|
data/lib/wp2txt/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.2
|
4
|
+
version: 1.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-05-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
208
208
|
- !ruby/object:Gem::Version
|
209
209
|
version: '0'
|
210
210
|
requirements: []
|
211
|
-
rubygems_version: 3.
|
211
|
+
rubygems_version: 3.4.12
|
212
212
|
signing_key:
|
213
213
|
specification_version: 4
|
214
214
|
summary: A command-line toolkit to extract text content and category data from Wikipedia
|