RubyGems - wp2txt - Versions diffs - 1.0.1 → 1.0.2 - Mend

wp2txt 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d33a41cf46688679a14eb8c3eb16f6ed33ce9175c7f5b566c9f87998ba2c8401
-  data.tar.gz: 7371e0f7b06b2f0846f01d66f461c7e106778adc6e686919302f0f29b1f80a9e
+  metadata.gz: bb540f4f17f7825786d110245c235ac556e3e64cedb17efae3e0591887425801
+  data.tar.gz: 479c357f7ba117ae10d9a5a04d24ce3aca2e54d942a156b02eb932c1aab55c8b
 SHA512:
-  metadata.gz: cab8d9c27989387acc6dbbe052029d2205508ce10e38b8eedc111c822328d8eba551d603020684cbb3844a87b747f261a5959f711267acd96a3b97ccef4f6834
-  data.tar.gz: 4de59be37d57ef3d14ae2304660e8dde069bdf645a7cff862026562b26327984f1be13840e9d6ec1f25110222367f71c84a0286b649d71fec0c13805c6b0a647
+  metadata.gz: 940d47d2c8bce06029fe76e3b3744563d089e26e297e5224b36e65d815295da57117eae84cbb43abeddf2f2c052e2a987d668cba52c7af6148e935b571b6d403
+  data.tar.gz: 8ce76523a3bf181ac7a5da11f088dd14cfb1e1d7ac0d5239832db52968d183db16a3ece6074513b634eebe0e5ca28ceea945eaef6542ecb1933266caf4e89a3c

data/README.md CHANGED Viewed

@@ -6,20 +6,26 @@ A command-line toolkit to extract text content and category data from Wikipedia
 WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.
-**UPDATE (August 2022)**
+## Changelog
-1. A new option `--category-only` has been added. When this option is enabled, only the title and category information of the article is extracted.
-2. A new option `--summary-only` has been added. If this option is enabled, only the title, category information, and opening paragraphs of the article will be extracted.
-3. Text conversion with the current version of WP2TXT is *more than 2x times faster* than the previous version due to parallel processing of multiple files (the rate of speedup depends on the CPU cores used for processing).
+**November 2022**
+- Code added to suppress "Invalid byte sequence error" when an illegal UTF-8 character is input.
+**August 2022**
+- A new option `--category-only` has been added. When this option is enabled, only the title and category information of the article is extracted.
+- A new option `--summary-only` has been added. If this option is enabled, only the title, category information, and opening paragraphs of the article will be extracted.
+- Text conversion with the current version of WP2TXT is *more than 2x times faster* than the previous version due to parallel processing of multiple files (the rate of speedup depends on the CPU cores used for processing).
 ## Screenshot
-<img src='https://raw.githubusercontent.com/yohasebe/wp2txt/master/image/screenshot.png' width="700" />
+<img src='https://raw.githubusercontent.com/yohasebe/wp2txt/master/image/screenshot.png' width="800" />
-**Environment**
+**Environment**
 - WP2TXT 1.0.1
-- MacBook Pro (2021 Apple M1 Pro)
+- MacBook Pro (2021 Apple M1 Pro)
 - enwiki-20220720-pages-articles.xml.bz2 (19.98 GB)
 In the above environment, the process (decompression, splitting, extraction, and conversion) to obtain the plain text data of the English Wikipedia takes less than 1.5 hours.
@@ -34,7 +40,7 @@ In the above environment, the process (decompression, splitting, extraction, and
 ## Preparation
-### For MacOS / Linux/ WSL2
+### For MacOS and Linux
 WP2TXT requires that one of the following commands be installed on the system in order to decompress `bz2` files:
@@ -184,11 +190,11 @@ The author will appreciate your mentioning one of these in your research.
 Or use this BibTeX entry:
 ```
-@misc{WP2TXT_2022,
+@misc{wp2txt_2022,
   author = {Yoichiro Hasebe},
   title = {WP2TXT: A command-line toolkit to extract text content and category data from Wikipedia dump files},
-  url = {https://github.com/yohasebe/wp2txt}
-  year = {2022},
+  url = {https://github.com/yohasebe/wp2txt},
+  year = {2022}
 }
 ```

data/lib/wp2txt/utils.rb CHANGED Viewed

@@ -41,7 +41,7 @@ $in_table_regex2 = Regexp.new('^\|\}.*?$')
 $in_unordered_regex  = Regexp.new('^\*')
 $in_ordered_regex    = Regexp.new('^\#')
 $in_pre_regex = Regexp.new('^ ')
-$in_definition_regex  = Regexp.new('^[\;\:]')
+$in_definition_regex  = Regexp.new('^[\;\:]')
 $blank_line_regex = Regexp.new('^\s*$')
 $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
 $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
@@ -98,11 +98,12 @@ $cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
 module Wp2txt
   def convert_characters!(text, has_retried = false)
-    begin
-      text << ""
+    begin
+      text << ""
       chrref_to_utf!(text)
       special_chr!(text)
+      text.encode!("UTF-8", "UTF-8", invalid: :replace, replace: "")
     rescue # detect invalid byte sequence in UTF-8
       if has_retried
         puts "invalid byte sequence detected"
@@ -112,20 +113,20 @@ module Wp2txt
         end
         exit
       else
-        text.encode!("UTF-16")
-        text.encode!("UTF-8")
+        text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
+        text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
         convert_characters!(text, true)
       end
     end
   end
   def format_wiki!(text, has_retried = false)
     remove_complex!(text)
     escape_nowiki!(text)
     process_interwiki_links!(text)
     process_external_links!(text)
-    unescape_nowiki!(text)
+    unescape_nowiki!(text)
     remove_directive!(text)
     remove_emphasis!(text)
     mndash!(text)
@@ -135,7 +136,7 @@ module Wp2txt
     remove_templates!(text) unless $leave_inline_template
     remove_table!(text) unless $leave_table
   end
   def cleanup!(text)
     text.gsub!($cleanup_regex_01){""}
     text.gsub!($cleanup_regex_02){""}
@@ -150,7 +151,7 @@ module Wp2txt
   end
   #################### parser for nested structure ####################
   def process_nested_structure(scanner, left, right, &block)
     test = false
     buffer = ""
@@ -195,7 +196,7 @@ module Wp2txt
     rescue => e
       return scanner.string
     end
-  end
+  end
   #################### methods used from format_wiki ####################
   def escape_nowiki!(str)
@@ -218,11 +219,11 @@ module Wp2txt
       @nowikis[obj_id]
     end
   end
   def process_interwiki_links!(str)
     scanner = StringScanner.new(str)
     result = process_nested_structure(scanner, "[[", "]]") do |contents|
-      parts = contents.split("|")
+      parts = contents.split("|")
       case parts.size
       when 1
         parts.first || ""
@@ -265,7 +266,7 @@ module Wp2txt
     end
     str.replace(result)
   end
   def remove_table!(str)
     scanner = StringScanner.new(str)
     result = process_nested_structure(scanner, "{|", "|}") do |contents|
@@ -273,7 +274,7 @@ module Wp2txt
     end
     str.replace(result)
   end
   def special_chr!(str)
     str.replace $html_decoder.decode(str)
   end
@@ -316,7 +317,7 @@ module Wp2txt
     end
     return true
   end
   def mndash!(str)
     str.gsub!($mndash_regex, "–")
   end
@@ -347,7 +348,7 @@ module Wp2txt
     str.gsub!($complex_regex_04){""}
     str.gsub!($complex_regex_05){""}
   end
   def make_reference!(str)
     str.gsub!($make_reference_regex_a){"\n"}
     str.gsub!($make_reference_regex_b){""}
@@ -413,7 +414,7 @@ module Wp2txt
     File.rename(file_path, file_path + ".bak")
     File.rename("temp", file_path)
     File.unlink(file_path + ".bak") unless backup
-  end
+  end
   # modify files under a directory (recursive)
   def batch_file_mod(dir_path, &block)
@@ -421,7 +422,7 @@ module Wp2txt
       collect_files(dir_path).each do |file|
         yield file if FileTest.file?(file)
       end
-    else
+    else
       yield dir_path if FileTest.file?(dir_path)
     end
   end
@@ -445,9 +446,9 @@ module Wp2txt
     end
   end
-  def rename(files, ext = "txt")
+  def rename(files, ext = "txt")
     # num of digits necessary to name the last file generated
-    maxwidth = 0
+    maxwidth = 0
     files.each do |f|
       width = f.slice(/\-(\d+)\z/, 1).to_s.length.to_i

data/lib/wp2txt/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Wp2txt
-  VERSION = "1.0.1"
+  VERSION = "1.0.2"
 end

data/tags ADDED Viewed

@@ -0,0 +1,58 @@
+!_TAG_FILE_FORMAT	2	/extended format; --format=1 will not append ;" to lines/
+!_TAG_FILE_SORTED	1	/0=unsorted, 1=sorted, 2=foldcase/
+!_TAG_PROGRAM_AUTHOR	Darren Hiebert	/dhiebert@users.sourceforge.net/
+!_TAG_PROGRAM_NAME	Exuberant Ctags	//
+!_TAG_PROGRAM_URL	http://ctags.sourceforge.net	/official site/
+!_TAG_PROGRAM_VERSION	5.8	//
+Article	lib/wp2txt/article.rb	/^  class Article$/;"	c	class:Wp2txt
+Runner	lib/wp2txt.rb	/^  class Runner$/;"	c	class:Wp2txt.Splitter.file_size
+Splitter	lib/wp2txt.rb	/^  class Splitter$/;"	c	class:Wp2txt
+Wp2txt	lib/wp2txt.rb	/^module Wp2txt$/;"	m
+Wp2txt	lib/wp2txt/article.rb	/^module Wp2txt$/;"	m
+Wp2txt	lib/wp2txt/utils.rb	/^module Wp2txt$/;"	m
+Wp2txt	lib/wp2txt/version.rb	/^module Wp2txt$/;"	m
+batch_file_mod	lib/wp2txt/utils.rb	/^  def batch_file_mod(dir_path, &block)$/;"	f
+chrref_to_utf!	lib/wp2txt/utils.rb	/^  def chrref_to_utf!(num_str)$/;"	f
+cleanup!	lib/wp2txt/utils.rb	/^  def cleanup!(text)$/;"	f
+collect_files	lib/wp2txt/utils.rb	/^  def collect_files(str, regex = nil)$/;"	f
+command_exist?	lib/wp2txt.rb	/^    def command_exist?(command)$/;"	f	class:Wp2txt.Splitter.file_size
+convert_characters!	lib/wp2txt/utils.rb	/^  def convert_characters!(text, has_retried = false)$/;"	f	class:Wp2txt
+correct_inline_template!	lib/wp2txt/utils.rb	/^  def correct_inline_template!(str)$/;"	f
+correct_separator	lib/wp2txt/utils.rb	/^  def correct_separator(input)$/;"	f
+create_element	lib/wp2txt/article.rb	/^    def create_element(tp, text)$/;"	f	class:Wp2txt.Article
+escape_nowiki!	lib/wp2txt/utils.rb	/^  def escape_nowiki!(str)$/;"	f
+extract_text	lib/wp2txt.rb	/^    def extract_text(&block)$/;"	f	class:Wp2txt.Splitter.file_size.Runner.fill_buffer
+file_mod	lib/wp2txt/utils.rb	/^  def file_mod(file_path, backup = false, &block)$/;"	f
+file_size	lib/wp2txt.rb	/^    def file_size(file)$/;"	f	class:Wp2txt.Splitter
+fill_buffer	lib/wp2txt.rb	/^    def fill_buffer$/;"	f	class:Wp2txt.Splitter.file_size
+fill_buffer	lib/wp2txt.rb	/^    def fill_buffer$/;"	f	class:Wp2txt.Splitter.file_size.Runner
+format_wiki!	lib/wp2txt/utils.rb	/^  def format_wiki!(text, has_retried = false)$/;"	f
+get_newline	lib/wp2txt.rb	/^    def get_newline$/;"	f	class:Wp2txt.Splitter.file_size.Runner.fill_buffer
+get_newline	lib/wp2txt.rb	/^    def get_newline$/;"	f	class:Wp2txt.Splitter.file_size.fill_buffer
+get_page	lib/wp2txt.rb	/^    def get_page$/;"	f	class:Wp2txt.Splitter.file_size.Runner.fill_buffer
+initialize	lib/wp2txt.rb	/^    def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)$/;"	f	class:Wp2txt.Splitter.file_size.Runner
+initialize	lib/wp2txt.rb	/^    def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)$/;"	f	class:Wp2txt.Splitter
+initialize	lib/wp2txt/article.rb	/^    def initialize(text, title = "", strip_tmarker = false)$/;"	f	class:Wp2txt.Article
+make_reference!	lib/wp2txt/utils.rb	/^  def make_reference!(str)$/;"	f
+mndash!	lib/wp2txt/utils.rb	/^  def mndash!(str)$/;"	f
+parse	lib/wp2txt/article.rb	/^    def parse(source)$/;"	f	class:Wp2txt.Article
+prepare	lib/wp2txt.rb	/^    def prepare$/;"	f	class:Wp2txt.Splitter.file_size
+prepare	lib/wp2txt.rb	/^    def prepare$/;"	f	class:Wp2txt.Splitter.file_size.Runner
+process_external_links!	lib/wp2txt/utils.rb	/^  def process_external_links!(str)$/;"	f
+process_interwiki_links!	lib/wp2txt/utils.rb	/^  def process_interwiki_links!(str)$/;"	f
+process_nested_structure	lib/wp2txt/utils.rb	/^  def process_nested_structure(scanner, left, right, &block)$/;"	f
+remove_complex!	lib/wp2txt/utils.rb	/^  def remove_complex!(str)$/;"	f
+remove_directive!	lib/wp2txt/utils.rb	/^  def remove_directive!(str)$/;"	f
+remove_emphasis!	lib/wp2txt/utils.rb	/^  def remove_emphasis!(str)$/;"	f
+remove_hr!	lib/wp2txt/utils.rb	/^  def remove_hr!(str)$/;"	f
+remove_html!	lib/wp2txt/utils.rb	/^  def remove_html!(str)$/;"	f
+remove_inbetween!	lib/wp2txt/utils.rb	/^  def remove_inbetween!(str, tagset = ['<', '>'])$/;"	f
+remove_ref!	lib/wp2txt/utils.rb	/^  def remove_ref!(str)$/;"	f
+remove_table!	lib/wp2txt/utils.rb	/^  def remove_table!(str)$/;"	f
+remove_tag!	lib/wp2txt/utils.rb	/^  def remove_tag!(str)$/;"	f
+remove_templates!	lib/wp2txt/utils.rb	/^  def remove_templates!(str)$/;"	f
+rename	lib/wp2txt/utils.rb	/^  def rename(files, ext = "txt")$/;"	f
+sec_to_str	lib/wp2txt/utils.rb	/^  def sec_to_str(int)$/;"	f
+special_chr!	lib/wp2txt/utils.rb	/^  def special_chr!(str)$/;"	f
+split_file	lib/wp2txt.rb	/^    def split_file$/;"	f	class:Wp2txt.Splitter.file_size.fill_buffer
+unescape_nowiki!	lib/wp2txt/utils.rb	/^  def unescape_nowiki!(str)$/;"	f

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wp2txt
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.2
 platform: ruby
 authors:
 - Yoichiro Hasebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-08-11 00:00:00.000000000 Z
+date: 2022-11-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -140,6 +140,7 @@ files:
 - lib/wp2txt/version.rb
 - spec/spec_helper.rb
 - spec/utils_spec.rb
+- tags
 - wp2txt.gemspec
 homepage: https://github.com/yohasebe/wp2txt
 licenses: []
@@ -159,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.3.7
+rubygems_version: 3.3.3
 signing_key:
 specification_version: 4
 summary: A command-line toolkit to extract text content and category data from Wikipedia