wp2txt 1.1.1 → 1.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0bcba2c84286504ae628176aad55dbdea05889dfaa7f471cf080ae933691cffc
4
- data.tar.gz: f83c63d7c6e91270da1da2aed54ab6e5c352c5695340ccd6378fdd20c43fc332
3
+ metadata.gz: 62b3ee240d3ee685e3739eb4a8d6f9923677f75a97490d73286d64730b60ae5b
4
+ data.tar.gz: 1a2b9c3d23266b45d96e0f0984ebbcf9e610aa91adbdb3cc4891a23767cd0315
5
5
  SHA512:
6
- metadata.gz: ae0eae028a98d4299a0e93278220b991e53e13deb80b88cce2971cd889d769808305e0d3aa8ee4e73af0cc55f07f27c0cc6a9f0d440e4693a410ba7d0a6333ba
7
- data.tar.gz: 25bff247bf80b4a0b5ff785ed51b60e3e21e6a3ca5e0bfeace9961df5060832ae047c3fa606a3267f9f770c253fd816176372c4d2f7ba73999d5317ea59933e6
6
+ metadata.gz: 14eb3cb035ac0815e30bcab8d4eaffd6ed84f8b7449f6d9ee2492d9656bbd847a3da8aad00fcb331a8c55a9379ce90ff33a5018f6ab35f1a1046c87c4b57ab17
7
+ data.tar.gz: 68a4e8bf5952a433be23528078a3fa526a3bcceceb1c3f8a7f79412256c78b9f5732ece782e1eb311711dec86eb9490d5f1596eba12ba15a627ebdae4999c551
data/README.md CHANGED
@@ -8,6 +8,14 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
8
8
 
9
9
  ## Changelog
10
10
 
11
+ **May 2023**
12
+
13
+ - Problems caused by running too many processes in parallel are addressed by capping the number of concurrent processes at 8.
14
+
15
+ **April 2023**
16
+
17
+ - File split/delete issues fixed
18
+
11
19
  **January 2023**
12
20
 
13
21
  - Bug related to command line arguments fixed
@@ -182,7 +190,7 @@ Command line options are as follows:
182
190
  -g, --category-only Extract only article title and categories
183
191
  -s, --summary-only Extract only article title, categories, and summary text before first heading
184
192
  -f, --file-size=<i> Approximate size (in MB) of each output file (default: 10)
185
- -n, --num-procs Number of proccesses to be run concurrently (default: max num of available CPU cores minus two)
193
+ -n, --num-procs Number of processes (up to 8) to be run concurrently (default: number of available CPU cores minus two, capped at 8)
186
194
  -x, --del-interfile Delete intermediate XML files from output dir
187
195
  -t, --title, --no-title Keep page titles in output (default: true)
188
196
  -d, --heading, --no-heading Keep section titles in output (default: true)
data/bin/wp2txt CHANGED
@@ -3,8 +3,7 @@
3
3
  # frozen_string_literal: true
4
4
 
5
5
  DEBUG_MODE = false
6
- SHAREDIR = File.join(File.dirname(__FILE__), "..", "share")
7
- DOCDIR = File.join(File.dirname(__FILE__), "..", "doc")
6
+ MAX_PROCESSORS = 8
8
7
 
9
8
  require_relative "../lib/wp2txt"
10
9
  require_relative "../lib/wp2txt/utils"
@@ -36,7 +35,7 @@ class WpApp
36
35
  opt :category_only, "Extract only article title and categories", default: false, short: "-g"
37
36
  opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
38
37
  opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
39
- opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
38
+ opt :num_procs, "Number of proccesses (up to #{MAX_PROCESSORS}) to be run concurrently (default: max num of CPU cores minus two)", type: Integer, short: "-n"
40
39
  opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
41
40
  opt :title, "Keep page titles in output", default: true, short: "-t"
42
41
  opt :heading, "Keep section titles in output", default: true, short: "-d"
@@ -57,10 +56,11 @@ class WpApp
57
56
  output_dir = opts[:output_dir]
58
57
  tfile_size = opts[:file_size]
59
58
  num_processors = Etc.nprocessors
60
- num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
59
+ num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors && opts[:num_procs].to_i <= MAX_PROCESSORS
61
60
  opts[:num_procs]
62
61
  else
63
- num_processors - 2
62
+ minus2 = num_processors - 2
63
+ minus2 < MAX_PROCESSORS ? minus2 : MAX_PROCESSORS
64
64
  end
65
65
  num_processes = 1 if num_processes < 1
66
66
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wp2txt
4
- VERSION = "1.1.1"
4
+ VERSION = "1.1.3"
5
5
  end
data/lib/wp2txt.rb CHANGED
@@ -160,7 +160,7 @@ module Wp2txt
160
160
  @fp.puts(output_text) if output_text != ""
161
161
  @fp.close
162
162
 
163
- if File.size(outfilename).zero?
163
+ if outfilename && File.size(outfilename).zero?
164
164
  File.delete(outfilename)
165
165
  @outfiles.delete(outfilename)
166
166
  end
@@ -297,6 +297,7 @@ module Wp2txt
297
297
  @fp.puts(output_text)
298
298
  @fp.close
299
299
  end
300
+ @file_pointer.close
300
301
  File.delete(@input_file) if @del_interfile
301
302
  output_text = +""
302
303
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-25 00:00:00.000000000 Z
11
+ date: 2023-05-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
208
208
  - !ruby/object:Gem::Version
209
209
  version: '0'
210
210
  requirements: []
211
- rubygems_version: 3.4.2
211
+ rubygems_version: 3.4.12
212
212
  signing_key:
213
213
  specification_version: 4
214
214
  summary: A command-line toolkit to extract text content and category data from Wikipedia