wp2txt 1.1.1 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0bcba2c84286504ae628176aad55dbdea05889dfaa7f471cf080ae933691cffc
4
- data.tar.gz: f83c63d7c6e91270da1da2aed54ab6e5c352c5695340ccd6378fdd20c43fc332
3
+ metadata.gz: 62b3ee240d3ee685e3739eb4a8d6f9923677f75a97490d73286d64730b60ae5b
4
+ data.tar.gz: 1a2b9c3d23266b45d96e0f0984ebbcf9e610aa91adbdb3cc4891a23767cd0315
5
5
  SHA512:
6
- metadata.gz: ae0eae028a98d4299a0e93278220b991e53e13deb80b88cce2971cd889d769808305e0d3aa8ee4e73af0cc55f07f27c0cc6a9f0d440e4693a410ba7d0a6333ba
7
- data.tar.gz: 25bff247bf80b4a0b5ff785ed51b60e3e21e6a3ca5e0bfeace9961df5060832ae047c3fa606a3267f9f770c253fd816176372c4d2f7ba73999d5317ea59933e6
6
+ metadata.gz: 14eb3cb035ac0815e30bcab8d4eaffd6ed84f8b7449f6d9ee2492d9656bbd847a3da8aad00fcb331a8c55a9379ce90ff33a5018f6ab35f1a1046c87c4b57ab17
7
+ data.tar.gz: 68a4e8bf5952a433be23528078a3fa526a3bcceceb1c3f8a7f79412256c78b9f5732ece782e1eb311711dec86eb9490d5f1596eba12ba15a627ebdae4999c551
data/README.md CHANGED
@@ -8,6 +8,14 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
8
8
 
9
9
  ## Changelog
10
10
 
11
+ **May 2023**
12
+
13
+ - Problems caused by running too many parallel processes are addressed by capping the number of concurrent processes at 8.
14
+
15
+ **April 2023**
16
+
17
+ - File split/delete issues fixed
18
+
11
19
  **January 2023**
12
20
 
13
21
  - Bug related to command line arguments fixed
@@ -182,7 +190,7 @@ Command line options are as follows:
182
190
  -g, --category-only Extract only article title and categories
183
191
  -s, --summary-only Extract only article title, categories, and summary text before first heading
184
192
  -f, --file-size=<i> Approximate size (in MB) of each output file (default: 10)
185
- -n, --num-procs Number of processes to be run concurrently (default: max num of available CPU cores minus two)
193
+ -n, --num-procs Number of processes (up to 8) to be run concurrently (default: max num of available CPU cores minus two)
186
194
  -x, --del-interfile Delete intermediate XML files from output dir
187
195
  -t, --title, --no-title Keep page titles in output (default: true)
188
196
  -d, --heading, --no-heading Keep section titles in output (default: true)
data/bin/wp2txt CHANGED
@@ -3,8 +3,7 @@
3
3
  # frozen_string_literal: true
4
4
 
5
5
  DEBUG_MODE = false
6
- SHAREDIR = File.join(File.dirname(__FILE__), "..", "share")
7
- DOCDIR = File.join(File.dirname(__FILE__), "..", "doc")
6
+ MAX_PROCESSORS = 8
8
7
 
9
8
  require_relative "../lib/wp2txt"
10
9
  require_relative "../lib/wp2txt/utils"
@@ -36,7 +35,7 @@ class WpApp
36
35
  opt :category_only, "Extract only article title and categories", default: false, short: "-g"
37
36
  opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
38
37
  opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
39
- opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
38
+ opt :num_procs, "Number of proccesses (up to #{MAX_PROCESSORS}) to be run concurrently (default: max num of CPU cores minus two)", type: Integer, short: "-n"
40
39
  opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
41
40
  opt :title, "Keep page titles in output", default: true, short: "-t"
42
41
  opt :heading, "Keep section titles in output", default: true, short: "-d"
@@ -57,10 +56,11 @@ class WpApp
57
56
  output_dir = opts[:output_dir]
58
57
  tfile_size = opts[:file_size]
59
58
  num_processors = Etc.nprocessors
60
- num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
59
+ num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors && opts[:num_procs].to_i <= MAX_PROCESSORS
61
60
  opts[:num_procs]
62
61
  else
63
- num_processors - 2
62
+ minus2 = num_processors - 2
63
+ minus2 < MAX_PROCESSORS ? minus2 : MAX_PROCESSORS
64
64
  end
65
65
  num_processes = 1 if num_processes < 1
66
66
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wp2txt
4
- VERSION = "1.1.1"
4
+ VERSION = "1.1.3"
5
5
  end
data/lib/wp2txt.rb CHANGED
@@ -160,7 +160,7 @@ module Wp2txt
160
160
  @fp.puts(output_text) if output_text != ""
161
161
  @fp.close
162
162
 
163
- if File.size(outfilename).zero?
163
+ if outfilename && File.size(outfilename).zero?
164
164
  File.delete(outfilename)
165
165
  @outfiles.delete(outfilename)
166
166
  end
@@ -297,6 +297,7 @@ module Wp2txt
297
297
  @fp.puts(output_text)
298
298
  @fp.close
299
299
  end
300
+ @file_pointer.close
300
301
  File.delete(@input_file) if @del_interfile
301
302
  output_text = +""
302
303
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-25 00:00:00.000000000 Z
11
+ date: 2023-05-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
208
208
  - !ruby/object:Gem::Version
209
209
  version: '0'
210
210
  requirements: []
211
- rubygems_version: 3.4.2
211
+ rubygems_version: 3.4.12
212
212
  signing_key:
213
213
  specification_version: 4
214
214
  summary: A command-line toolkit to extract text content and category data from Wikipedia