RubyGems - wp2txt - Versions diffs - 1.0.1 → 1.0.2 - Mend

wp2txt 1.0.1 → 1.0.2

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d33a41cf46688679a14eb8c3eb16f6ed33ce9175c7f5b566c9f87998ba2c8401
-  data.tar.gz: 7371e0f7b06b2f0846f01d66f461c7e106778adc6e686919302f0f29b1f80a9e
+  metadata.gz: bb540f4f17f7825786d110245c235ac556e3e64cedb17efae3e0591887425801
+  data.tar.gz: 479c357f7ba117ae10d9a5a04d24ce3aca2e54d942a156b02eb932c1aab55c8b
 SHA512:
-  metadata.gz: cab8d9c27989387acc6dbbe052029d2205508ce10e38b8eedc111c822328d8eba551d603020684cbb3844a87b747f261a5959f711267acd96a3b97ccef4f6834
-  data.tar.gz: 4de59be37d57ef3d14ae2304660e8dde069bdf645a7cff862026562b26327984f1be13840e9d6ec1f25110222367f71c84a0286b649d71fec0c13805c6b0a647
+  metadata.gz: 940d47d2c8bce06029fe76e3b3744563d089e26e297e5224b36e65d815295da57117eae84cbb43abeddf2f2c052e2a987d668cba52c7af6148e935b571b6d403
+  data.tar.gz: 8ce76523a3bf181ac7a5da11f088dd14cfb1e1d7ac0d5239832db52968d183db16a3ece6074513b634eebe0e5ca28ceea945eaef6542ecb1933266caf4e89a3c

data/README.md CHANGED Viewed

@@ -6,20 +6,26 @@ A command-line toolkit to extract text content and category data from Wikipedia
 WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.
-**UPDATE (August 2022)**
+## Changelog
-1. A new option `--category-only` has been added. When this option is enabled, only the title and category information of the article is extracted.
-2. A new option `--summary-only` has been added. If this option is enabled, only the title, category information, and opening paragraphs of the article will be extracted.
-3. Text conversion with the current version of WP2TXT is *more than 2x times faster* than the previous version due to parallel processing of multiple files (the rate of speedup depends on the CPU cores used for processing).
+**November 2022**
+- Code added to suppress the "Invalid byte sequence" error when the input contains an illegal UTF-8 character.
+**August 2022**
+- A new option `--category-only` has been added. When this option is enabled, only the title and category information of the article is extracted.
+- A new option `--summary-only` has been added. If this option is enabled, only the title, category information, and opening paragraphs of the article will be extracted.
+- Text conversion with the current version of WP2TXT is *more than twice as fast* as the previous version due to parallel processing of multiple files (the rate of speedup depends on the CPU cores used for processing).
 ## Screenshot
-<img src='https://raw.githubusercontent.com/yohasebe/wp2txt/master/image/screenshot.png' width="700" />
+<img src='https://raw.githubusercontent.com/yohasebe/wp2txt/master/image/screenshot.png' width="800" />
-**Environment**
+**Environment**
 - WP2TXT 1.0.1
-- MacBook Pro (2021 Apple M1 Pro)
+- MacBook Pro (2021 Apple M1 Pro)
 - enwiki-20220720-pages-articles.xml.bz2 (19.98 GB)
 In the above environment, the process (decompression, splitting, extraction, and conversion) to obtain the plain text data of the English Wikipedia takes less than 1.5 hours.
@@ -34,7 +40,7 @@ In the above environment, the process (decompression, splitting, extraction, and
 ## Preparation
-### For MacOS / Linux/ WSL2
+### For MacOS and Linux
 WP2TXT requires that one of the following commands be installed on the system in order to decompress `bz2` files:
@@ -184,11 +190,11 @@ The author will appreciate your mentioning one of these in your research.
 Or use this BibTeX entry:
 ```
-@misc{WP2TXT_2022,
+@misc{wp2txt_2022,
   author = {Yoichiro Hasebe},
   title = {WP2TXT: A command-line toolkit to extract text content and category data from Wikipedia dump files},
-  url = {https://github.com/yohasebe/wp2txt}
-  year = {2022},
+  url = {https://github.com/yohasebe/wp2txt},
+  year = {2022}
 }
 ```

data/lib/wp2txt/utils.rb CHANGED Viewed

@@ -41,7 +41,7 @@ $in_table_regex2 = Regexp.new('^\|\}.*?$')
 $in_unordered_regex  = Regexp.new('^\*')
 $in_ordered_regex    = Regexp.new('^\#')
 $in_pre_regex = Regexp.new('^ ')
-$in_definition_regex  = Regexp.new('^[\;\:]')
+$in_definition_regex  = Regexp.new('^[\;\:]')
 $blank_line_regex = Regexp.new('^\s*$')
 $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
 $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
@@ -98,11 +98,12 @@ $cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
 module Wp2txt
   def convert_characters!(text, has_retried = false)
-    begin
-      text << ""
+    begin
+      text << ""
       chrref_to_utf!(text)
       special_chr!(text)
+      text.encode!("UTF-8", "UTF-8", invalid: :replace, replace: "")
     rescue # detect invalid byte sequence in UTF-8
       if has_retried
         puts "invalid byte sequence detected"
@@ -112,20 +113,20 @@ module Wp2txt
         end
         exit
       else
-        text.encode!("UTF-16")
-        text.encode!("UTF-8")
+        text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
+        text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
         convert_characters!(text, true)
       end
     end
   end
   def format_wiki!(text, has_retried = false)
     remove_complex!(text)
     escape_nowiki!(text)
     process_interwiki_links!(text)
     process_external_links!(text)
-    unescape_nowiki!(text)
+    unescape_nowiki!(text)
     remove_directive!(text)
     remove_emphasis!(text)
     mndash!(text)
@@ -135,7 +136,7 @@ module Wp2txt
     remove_templates!(text) unless $leave_inline_template
     remove_table!(text) unless $leave_table
   end
   def cleanup!(text)
     text.gsub!($cleanup_regex_01){""}
     text.gsub!($cleanup_regex_02){""}
@@ -150,7 +151,7 @@ module Wp2txt
   end
   #################### parser for nested structure ####################
   def process_nested_structure(scanner, left, right, &block)
     test = false
     buffer = ""
@@ -195,7 +196,7 @@ module Wp2txt
     rescue => e
       return scanner.string
     end
-  end
+  end
   #################### methods used from format_wiki ####################
   def escape_nowiki!(str)
@@ -218,11 +219,11 @@ module Wp2txt
       @nowikis[obj_id]
     end
   end
   def process_interwiki_links!(str)
     scanner = StringScanner.new(str)
     result = process_nested_structure(scanner, "[[", "]]") do |contents|
-      parts = contents.split("|")
+      parts = contents.split("|")
       case parts.size
       when 1
         parts.first || ""
@@ -265,7 +266,7 @@ module Wp2txt
     end
     str.replace(result)
   end
   def remove_table!(str)
     scanner = StringScanner.new(str)
     result = process_nested_structure(scanner, "{|", "|}") do |contents|
@@ -273,7 +274,7 @@ module Wp2txt
     end
     str.replace(result)
   end
   def special_chr!(str)
     str.replace $html_decoder.decode(str)
   end
@@ -316,7 +317,7 @@ module Wp2txt
     end
     return true
   end
   def mndash!(str)
     str.gsub!($mndash_regex, "–")
   end
@@ -347,7 +348,7 @@ module Wp2txt
     str.gsub!($complex_regex_04){""}
     str.gsub!($complex_regex_05){""}
   end
   def make_reference!(str)
     str.gsub!($make_reference_regex_a){"\n"}
     str.gsub!($make_reference_regex_b){""}
@@ -413,7 +414,7 @@ module Wp2txt
     File.rename(file_path, file_path + ".bak")
     File.rename("temp", file_path)
     File.unlink(file_path + ".bak") unless backup
-  end
+  end
   # modify files under a directory (recursive)
   def batch_file_mod(dir_path, &block)
@@ -421,7 +422,7 @@ module Wp2txt
       collect_files(dir_path).each do |file|
         yield file if FileTest.file?(file)
       end
-    else
+    else
       yield dir_path if FileTest.file?(dir_path)
     end
   end
@@ -445,9 +446,9 @@ module Wp2txt
     end
   end
-  def rename(files, ext = "txt")
+  def rename(files, ext = "txt")
     # num of digits necessary to name the last file generated
-    maxwidth = 0
+    maxwidth = 0
     files.each do |f|
       width = f.slice(/\-(\d+)\z/, 1).to_s.length.to_i

data/lib/wp2txt/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Wp2txt
-  VERSION = "1.0.1"
+  VERSION = "1.0.2"
 end

data/tags ADDED Viewed

@@ -0,0 +1,58 @@
+!_TAG_FILE_FORMAT	2	/extended format; --format=1 will not append ;" to lines/
+!_TAG_FILE_SORTED	1	/0=unsorted, 1=sorted, 2=foldcase/
+!_TAG_PROGRAM_AUTHOR	Darren Hiebert	/dhiebert@users.sourceforge.net/
+!_TAG_PROGRAM_NAME	Exuberant Ctags	//
+!_TAG_PROGRAM_URL	http://ctags.sourceforge.net	/official site/
+!_TAG_PROGRAM_VERSION	5.8	//
+Article	lib/wp2txt/article.rb	/^  class Article$/;"	c	class:Wp2txt
+Runner	lib/wp2txt.rb	/^  class Runner$/;"	c	class:Wp2txt.Splitter.file_size
+Splitter	lib/wp2txt.rb	/^  class Splitter$/;"	c	class:Wp2txt
+Wp2txt	lib/wp2txt.rb	/^module Wp2txt$/;"	m
+Wp2txt	lib/wp2txt/article.rb	/^module Wp2txt$/;"	m
+Wp2txt	lib/wp2txt/utils.rb	/^module Wp2txt$/;"	m
+Wp2txt	lib/wp2txt/version.rb	/^module Wp2txt$/;"	m
+batch_file_mod	lib/wp2txt/utils.rb	/^  def batch_file_mod(dir_path, &block)$/;"	f
+chrref_to_utf!	lib/wp2txt/utils.rb	/^  def chrref_to_utf!(num_str)$/;"	f
+cleanup!	lib/wp2txt/utils.rb	/^  def cleanup!(text)$/;"	f
+collect_files	lib/wp2txt/utils.rb	/^  def collect_files(str, regex = nil)$/;"	f
+command_exist?	lib/wp2txt.rb	/^    def command_exist?(command)$/;"	f	class:Wp2txt.Splitter.file_size
+convert_characters!	lib/wp2txt/utils.rb	/^  def convert_characters!(text, has_retried = false)$/;"	f	class:Wp2txt
+correct_inline_template!	lib/wp2txt/utils.rb	/^  def correct_inline_template!(str)$/;"	f
+correct_separator	lib/wp2txt/utils.rb	/^  def correct_separator(input)$/;"	f
+create_element	lib/wp2txt/article.rb	/^    def create_element(tp, text)$/;"	f	class:Wp2txt.Article
+escape_nowiki!	lib/wp2txt/utils.rb	/^  def escape_nowiki!(str)$/;"	f
+extract_text	lib/wp2txt.rb	/^    def extract_text(&block)$/;"	f	class:Wp2txt.Splitter.file_size.Runner.fill_buffer
+file_mod	lib/wp2txt/utils.rb	/^  def file_mod(file_path, backup = false, &block)$/;"	f
+file_size	lib/wp2txt.rb	/^    def file_size(file)$/;"	f	class:Wp2txt.Splitter
+fill_buffer	lib/wp2txt.rb	/^    def fill_buffer$/;"	f	class:Wp2txt.Splitter.file_size
+fill_buffer	lib/wp2txt.rb	/^    def fill_buffer$/;"	f	class:Wp2txt.Splitter.file_size.Runner
+format_wiki!	lib/wp2txt/utils.rb	/^  def format_wiki!(text, has_retried = false)$/;"	f
+get_newline	lib/wp2txt.rb	/^    def get_newline$/;"	f	class:Wp2txt.Splitter.file_size.Runner.fill_buffer
+get_newline	lib/wp2txt.rb	/^    def get_newline$/;"	f	class:Wp2txt.Splitter.file_size.fill_buffer
+get_page	lib/wp2txt.rb	/^    def get_page$/;"	f	class:Wp2txt.Splitter.file_size.Runner.fill_buffer
+initialize	lib/wp2txt.rb	/^    def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)$/;"	f	class:Wp2txt.Splitter.file_size.Runner
+initialize	lib/wp2txt.rb	/^    def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)$/;"	f	class:Wp2txt.Splitter
+initialize	lib/wp2txt/article.rb	/^    def initialize(text, title = "", strip_tmarker = false)$/;"	f	class:Wp2txt.Article
+make_reference!	lib/wp2txt/utils.rb	/^  def make_reference!(str)$/;"	f
+mndash!	lib/wp2txt/utils.rb	/^  def mndash!(str)$/;"	f
+parse	lib/wp2txt/article.rb	/^    def parse(source)$/;"	f	class:Wp2txt.Article
+prepare	lib/wp2txt.rb	/^    def prepare$/;"	f	class:Wp2txt.Splitter.file_size
+prepare	lib/wp2txt.rb	/^    def prepare$/;"	f	class:Wp2txt.Splitter.file_size.Runner
+process_external_links!	lib/wp2txt/utils.rb	/^  def process_external_links!(str)$/;"	f
+process_interwiki_links!	lib/wp2txt/utils.rb	/^  def process_interwiki_links!(str)$/;"	f
+process_nested_structure	lib/wp2txt/utils.rb	/^  def process_nested_structure(scanner, left, right, &block)$/;"	f
+remove_complex!	lib/wp2txt/utils.rb	/^  def remove_complex!(str)$/;"	f
+remove_directive!	lib/wp2txt/utils.rb	/^  def remove_directive!(str)$/;"	f
+remove_emphasis!	lib/wp2txt/utils.rb	/^  def remove_emphasis!(str)$/;"	f
+remove_hr!	lib/wp2txt/utils.rb	/^  def remove_hr!(str)$/;"	f
+remove_html!	lib/wp2txt/utils.rb	/^  def remove_html!(str)$/;"	f
+remove_inbetween!	lib/wp2txt/utils.rb	/^  def remove_inbetween!(str, tagset = ['<', '>'])$/;"	f
+remove_ref!	lib/wp2txt/utils.rb	/^  def remove_ref!(str)$/;"	f
+remove_table!	lib/wp2txt/utils.rb	/^  def remove_table!(str)$/;"	f
+remove_tag!	lib/wp2txt/utils.rb	/^  def remove_tag!(str)$/;"	f
+remove_templates!	lib/wp2txt/utils.rb	/^  def remove_templates!(str)$/;"	f
+rename	lib/wp2txt/utils.rb	/^  def rename(files, ext = "txt")$/;"	f
+sec_to_str	lib/wp2txt/utils.rb	/^  def sec_to_str(int)$/;"	f
+special_chr!	lib/wp2txt/utils.rb	/^  def special_chr!(str)$/;"	f
+split_file	lib/wp2txt.rb	/^    def split_file$/;"	f	class:Wp2txt.Splitter.file_size.fill_buffer
+unescape_nowiki!	lib/wp2txt/utils.rb	/^  def unescape_nowiki!(str)$/;"	f

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wp2txt
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.2
 platform: ruby
 authors:
 - Yoichiro Hasebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-08-11 00:00:00.000000000 Z
+date: 2022-11-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -140,6 +140,7 @@ files:
 - lib/wp2txt/version.rb
 - spec/spec_helper.rb
 - spec/utils_spec.rb
+- tags
 - wp2txt.gemspec
 homepage: https://github.com/yohasebe/wp2txt
 licenses: []
@@ -159,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.3.7
+rubygems_version: 3.3.3
 signing_key:
 specification_version: 4
 summary: A command-line toolkit to extract text content and category data from Wikipedia