wp2txt 1.1.2 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5e8cda1ff32863bb95b4b314782e46007eabcd784938b4fe33f6660445a37d31
4
- data.tar.gz: 805bd29ba8b660e705156bf7a4cc1d006b2f43a2c81c24e86ef210ef1fd1ef16
3
+ metadata.gz: 62b3ee240d3ee685e3739eb4a8d6f9923677f75a97490d73286d64730b60ae5b
4
+ data.tar.gz: 1a2b9c3d23266b45d96e0f0984ebbcf9e610aa91adbdb3cc4891a23767cd0315
5
5
  SHA512:
6
- metadata.gz: fe798d5ab55cefd55f776e4d0f975cb510a7c9c65af348ba216365827266d808b65f9125d9bc50c21cb05349ae71d07a46a80998ffd000bbcaac71b2eed15e45
7
- data.tar.gz: 94f8df87a935b52d19f05adca27a01f64787dfa35fed067dc68cc1204b4b0022411f6cb6db1d2c9175987d27113e506f84d1761e4661c7d4f65f934c6ee1647e
6
+ metadata.gz: 14eb3cb035ac0815e30bcab8d4eaffd6ed84f8b7449f6d9ee2492d9656bbd847a3da8aad00fcb331a8c55a9379ce90ff33a5018f6ab35f1a1046c87c4b57ab17
7
+ data.tar.gz: 68a4e8bf5952a433be23528078a3fa526a3bcceceb1c3f8a7f79412256c78b9f5732ece782e1eb311711dec86eb9490d5f1596eba12ba15a627ebdae4999c551
data/README.md CHANGED
@@ -8,6 +8,10 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
8
8
 
9
9
  ## Changelog
10
10
 
11
+ **May 2023**
12
+
13
+ - Problems caused by running too many parallel processes are addressed by capping the number of concurrent processes at 8.
14
+
11
15
  **April 2023**
12
16
 
13
17
  - File split/delete issues fixed
@@ -186,7 +190,7 @@ Command line options are as follows:
186
190
  -g, --category-only Extract only article title and categories
187
191
  -s, --summary-only Extract only article title, categories, and summary text before first heading
188
192
  -f, --file-size=<i> Approximate size (in MB) of each output file (default: 10)
189
- -n, --num-procs Number of proccesses to be run concurrently (default: max num of available CPU cores minus two)
193
+ -n, --num-procs Number of processes (up to 8) to be run concurrently (default: max num of available CPU cores minus two)
190
194
  -x, --del-interfile Delete intermediate XML files from output dir
191
195
  -t, --title, --no-title Keep page titles in output (default: true)
192
196
  -d, --heading, --no-heading Keep section titles in output (default: true)
data/bin/wp2txt CHANGED
@@ -3,6 +3,7 @@
3
3
  # frozen_string_literal: true
4
4
 
5
5
  DEBUG_MODE = false
6
+ MAX_PROCESSORS = 8
6
7
 
7
8
  require_relative "../lib/wp2txt"
8
9
  require_relative "../lib/wp2txt/utils"
@@ -34,7 +35,7 @@ class WpApp
34
35
  opt :category_only, "Extract only article title and categories", default: false, short: "-g"
35
36
  opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
36
37
  opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
37
- opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
38
+ opt :num_procs, "Number of proccesses (up to #{MAX_PROCESSORS}) to be run concurrently (default: max num of CPU cores minus two)", type: Integer, short: "-n"
38
39
  opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
39
40
  opt :title, "Keep page titles in output", default: true, short: "-t"
40
41
  opt :heading, "Keep section titles in output", default: true, short: "-d"
@@ -55,10 +56,11 @@ class WpApp
55
56
  output_dir = opts[:output_dir]
56
57
  tfile_size = opts[:file_size]
57
58
  num_processors = Etc.nprocessors
58
- num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
59
+ num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors && opts[:num_procs].to_i <= MAX_PROCESSORS
59
60
  opts[:num_procs]
60
61
  else
61
- num_processors - 2
62
+ minus2 = num_processors - 2
63
+ minus2 < MAX_PROCESSORS ? minus2 : MAX_PROCESSORS
62
64
  end
63
65
  num_processes = 1 if num_processes < 1
64
66
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wp2txt
4
- VERSION = "1.1.2"
4
+ VERSION = "1.1.3"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2
4
+ version: 1.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-15 00:00:00.000000000 Z
11
+ date: 2023-05-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
208
208
  - !ruby/object:Gem::Version
209
209
  version: '0'
210
210
  requirements: []
211
- rubygems_version: 3.3.3
211
+ rubygems_version: 3.4.12
212
212
  signing_key:
213
213
  specification_version: 4
214
214
  summary: A command-line toolkit to extract text content and category data from Wikipedia