RubyGems - wp2txt - Versions diffs - 1.1.1 → 1.1.3 - Mend

wp2txt 1.1.1 → 1.1.3

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0bcba2c84286504ae628176aad55dbdea05889dfaa7f471cf080ae933691cffc
-  data.tar.gz: f83c63d7c6e91270da1da2aed54ab6e5c352c5695340ccd6378fdd20c43fc332
+  metadata.gz: 62b3ee240d3ee685e3739eb4a8d6f9923677f75a97490d73286d64730b60ae5b
+  data.tar.gz: 1a2b9c3d23266b45d96e0f0984ebbcf9e610aa91adbdb3cc4891a23767cd0315
 SHA512:
-  metadata.gz: ae0eae028a98d4299a0e93278220b991e53e13deb80b88cce2971cd889d769808305e0d3aa8ee4e73af0cc55f07f27c0cc6a9f0d440e4693a410ba7d0a6333ba
-  data.tar.gz: 25bff247bf80b4a0b5ff785ed51b60e3e21e6a3ca5e0bfeace9961df5060832ae047c3fa606a3267f9f770c253fd816176372c4d2f7ba73999d5317ea59933e6
+  metadata.gz: 14eb3cb035ac0815e30bcab8d4eaffd6ed84f8b7449f6d9ee2492d9656bbd847a3da8aad00fcb331a8c55a9379ce90ff33a5018f6ab35f1a1046c87c4b57ab17
+  data.tar.gz: 68a4e8bf5952a433be23528078a3fa526a3bcceceb1c3f8a7f79412256c78b9f5732ece782e1eb311711dec86eb9490d5f1596eba12ba15a627ebdae4999c551

data/README.md CHANGED Viewed

@@ -8,6 +8,14 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
 ## Changelog
+**May 2023**
+- Problems caused by running too many parallel processes are addressed by capping the number of concurrent processes at 8.
+**April 2023**
+- File split/delete issues fixed
 **January 2023**
 - Bug related to command line arguments fixed
@@ -182,7 +190,7 @@ Command line options are as follows:
       -g, --category-only              Extract only article title and categories
       -s, --summary-only               Extract only article title, categories, and summary text before first heading
       -f, --file-size=<i>              Approximate size (in MB) of each output file (default: 10)
-      -n, --num-procs                  Number of proccesses to be run concurrently (default: max num of available CPU cores minus two)
+      -n, --num-procs                  Number of processes (up to 8) to be run concurrently (default: max num of available CPU cores minus two)
       -x, --del-interfile              Delete intermediate XML files from output dir
       -t, --title, --no-title          Keep page titles in output (default: true)
       -d, --heading, --no-heading      Keep section titles in output (default: true)

data/bin/wp2txt CHANGED Viewed

@@ -3,8 +3,7 @@
 # frozen_string_literal: true
 DEBUG_MODE = false
-SHAREDIR = File.join(File.dirname(__FILE__), "..", "share")
-DOCDIR = File.join(File.dirname(__FILE__), "..", "doc")
+MAX_PROCESSORS = 8
 require_relative "../lib/wp2txt"
 require_relative "../lib/wp2txt/utils"
@@ -36,7 +35,7 @@ class WpApp
       opt :category_only, "Extract only article title and categories", default: false, short: "-g"
       opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
       opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
-      opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
+      opt :num_procs, "Number of proccesses (up to #{MAX_PROCESSORS}) to be run concurrently (default: max num of CPU cores minus two)", type: Integer, short: "-n"
       opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
       opt :title, "Keep page titles in output", default: true, short: "-t"
       opt :heading, "Keep section titles in output", default: true, short: "-d"
@@ -57,10 +56,11 @@ class WpApp
     output_dir = opts[:output_dir]
     tfile_size = opts[:file_size]
     num_processors = Etc.nprocessors
-    num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
+    num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors && opts[:num_procs].to_i <= MAX_PROCESSORS
                       opts[:num_procs]
                     else
-                      num_processors - 2
+                      minus2 = num_processors - 2
+                      minus2 < MAX_PROCESSORS ? minus2 : MAX_PROCESSORS
                     end
     num_processes = 1 if num_processes < 1

data/lib/wp2txt/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Wp2txt
-  VERSION = "1.1.1"
+  VERSION = "1.1.3"
 end

data/lib/wp2txt.rb CHANGED Viewed

@@ -160,7 +160,7 @@ module Wp2txt
       @fp.puts(output_text) if output_text != ""
       @fp.close
-      if File.size(outfilename).zero?
+      if outfilename && File.size(outfilename).zero?
         File.delete(outfilename)
         @outfiles.delete(outfilename)
       end
@@ -297,6 +297,7 @@ module Wp2txt
           @fp.puts(output_text)
           @fp.close
         end
+        @file_pointer.close
         File.delete(@input_file) if @del_interfile
         output_text = +""
       end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wp2txt
 version: !ruby/object:Gem::Version
-  version: 1.1.1
+  version: 1.1.3
 platform: ruby
 authors:
 - Yoichiro Hasebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-01-25 00:00:00.000000000 Z
+date: 2023-05-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -208,7 +208,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.4.2
+rubygems_version: 3.4.12
 signing_key:
 specification_version: 4
 summary: A command-line toolkit to extract text content and category data from Wikipedia