wp2txt 1.1.2 → 1.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5e8cda1ff32863bb95b4b314782e46007eabcd784938b4fe33f6660445a37d31
4
- data.tar.gz: 805bd29ba8b660e705156bf7a4cc1d006b2f43a2c81c24e86ef210ef1fd1ef16
3
+ metadata.gz: 62b3ee240d3ee685e3739eb4a8d6f9923677f75a97490d73286d64730b60ae5b
4
+ data.tar.gz: 1a2b9c3d23266b45d96e0f0984ebbcf9e610aa91adbdb3cc4891a23767cd0315
5
5
  SHA512:
6
- metadata.gz: fe798d5ab55cefd55f776e4d0f975cb510a7c9c65af348ba216365827266d808b65f9125d9bc50c21cb05349ae71d07a46a80998ffd000bbcaac71b2eed15e45
7
- data.tar.gz: 94f8df87a935b52d19f05adca27a01f64787dfa35fed067dc68cc1204b4b0022411f6cb6db1d2c9175987d27113e506f84d1761e4661c7d4f65f934c6ee1647e
6
+ metadata.gz: 14eb3cb035ac0815e30bcab8d4eaffd6ed84f8b7449f6d9ee2492d9656bbd847a3da8aad00fcb331a8c55a9379ce90ff33a5018f6ab35f1a1046c87c4b57ab17
7
+ data.tar.gz: 68a4e8bf5952a433be23528078a3fa526a3bcceceb1c3f8a7f79412256c78b9f5732ece782e1eb311711dec86eb9490d5f1596eba12ba15a627ebdae4999c551
data/README.md CHANGED
@@ -8,6 +8,10 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
8
8
 
9
9
  ## Changelog
10
10
 
11
+ **May 2023**
12
+
13
+ - Problems caused by running too many parallel processes are addressed by limiting the number of concurrent processes to a maximum of 8.
14
+
11
15
  **April 2023**
12
16
 
13
17
  - File split/delete issues fixed
@@ -186,7 +190,7 @@ Command line options are as follows:
186
190
  -g, --category-only Extract only article title and categories
187
191
  -s, --summary-only Extract only article title, categories, and summary text before first heading
188
192
  -f, --file-size=<i> Approximate size (in MB) of each output file (default: 10)
189
- -n, --num-procs Number of proccesses to be run concurrently (default: max num of available CPU cores minus two)
193
+ -n, --num-procs Number of processes (up to 8) to be run concurrently (default: max num of available CPU cores minus two)
190
194
  -x, --del-interfile Delete intermediate XML files from output dir
191
195
  -t, --title, --no-title Keep page titles in output (default: true)
192
196
  -d, --heading, --no-heading Keep section titles in output (default: true)
data/bin/wp2txt CHANGED
@@ -3,6 +3,7 @@
3
3
  # frozen_string_literal: true
4
4
 
5
5
  DEBUG_MODE = false
6
+ MAX_PROCESSORS = 8
6
7
 
7
8
  require_relative "../lib/wp2txt"
8
9
  require_relative "../lib/wp2txt/utils"
@@ -34,7 +35,7 @@ class WpApp
34
35
  opt :category_only, "Extract only article title and categories", default: false, short: "-g"
35
36
  opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
36
37
  opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
37
- opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
38
+ opt :num_procs, "Number of proccesses (up to #{MAX_PROCESSORS}) to be run concurrently (default: max num of CPU cores minus two)", type: Integer, short: "-n"
38
39
  opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
39
40
  opt :title, "Keep page titles in output", default: true, short: "-t"
40
41
  opt :heading, "Keep section titles in output", default: true, short: "-d"
@@ -55,10 +56,11 @@ class WpApp
55
56
  output_dir = opts[:output_dir]
56
57
  tfile_size = opts[:file_size]
57
58
  num_processors = Etc.nprocessors
58
- num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
59
+ num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors && opts[:num_procs].to_i <= MAX_PROCESSORS
59
60
  opts[:num_procs]
60
61
  else
61
- num_processors - 2
62
+ minus2 = num_processors - 2
63
+ minus2 < MAX_PROCESSORS ? minus2 : MAX_PROCESSORS
62
64
  end
63
65
  num_processes = 1 if num_processes < 1
64
66
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wp2txt
4
- VERSION = "1.1.2"
4
+ VERSION = "1.1.3"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2
4
+ version: 1.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-15 00:00:00.000000000 Z
11
+ date: 2023-05-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
208
208
  - !ruby/object:Gem::Version
209
209
  version: '0'
210
210
  requirements: []
211
- rubygems_version: 3.3.3
211
+ rubygems_version: 3.4.12
212
212
  signing_key:
213
213
  specification_version: 4
214
214
  summary: A command-line toolkit to extract text content and category data from Wikipedia