RubyGems - wp2txt - Versions diffs - 1.1.2 → 2.1.0 - Mend

wp2txt 1.1.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

checksums.yaml +4 -4
data/.dockerignore +12 -0
data/.github/workflows/ci.yml +13 -13
data/.gitignore +14 -0
data/CHANGELOG.md +284 -0
data/DEVELOPMENT.md +415 -0
data/DEVELOPMENT_ja.md +415 -0
data/Dockerfile +19 -10
data/Gemfile +2 -8
data/README.md +261 -121
data/README_ja.md +375 -0
data/Rakefile +4 -0
data/bin/wp2txt +863 -159
data/lib/wp2txt/article.rb +98 -13
data/lib/wp2txt/bz2_validator.rb +239 -0
data/lib/wp2txt/category_cache.rb +313 -0
data/lib/wp2txt/cli.rb +319 -0
data/lib/wp2txt/cli_ui.rb +428 -0
data/lib/wp2txt/config.rb +158 -0
data/lib/wp2txt/constants.rb +134 -0
data/lib/wp2txt/data/html_entities.json +2135 -0
data/lib/wp2txt/data/language_metadata.json +4769 -0
data/lib/wp2txt/data/language_tiers.json +59 -0
data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
data/lib/wp2txt/data/template_aliases.json +193 -0
data/lib/wp2txt/data/wikipedia_entities.json +12 -0
data/lib/wp2txt/extractor.rb +545 -0
data/lib/wp2txt/file_utils.rb +91 -0
data/lib/wp2txt/formatter.rb +352 -0
data/lib/wp2txt/global_data_cache.rb +353 -0
data/lib/wp2txt/index_cache.rb +258 -0
data/lib/wp2txt/magic_words.rb +353 -0
data/lib/wp2txt/memory_monitor.rb +236 -0
data/lib/wp2txt/multistream.rb +1383 -0
data/lib/wp2txt/output_writer.rb +182 -0
data/lib/wp2txt/parser_functions.rb +606 -0
data/lib/wp2txt/ractor_worker.rb +215 -0
data/lib/wp2txt/regex.rb +396 -12
data/lib/wp2txt/section_extractor.rb +354 -0
data/lib/wp2txt/stream_processor.rb +271 -0
data/lib/wp2txt/template_expander.rb +830 -0
data/lib/wp2txt/text_processing.rb +337 -0
data/lib/wp2txt/utils.rb +629 -270
data/lib/wp2txt/version.rb +1 -1
data/lib/wp2txt.rb +53 -26
data/scripts/benchmark_regex.rb +161 -0
data/scripts/fetch_html_entities.rb +94 -0
data/scripts/fetch_language_metadata.rb +180 -0
data/scripts/fetch_mediawiki_data.rb +334 -0
data/scripts/fetch_template_data.rb +186 -0
data/scripts/profile_memory.rb +139 -0
data/spec/article_spec.rb +402 -0
data/spec/auto_download_spec.rb +314 -0
data/spec/bz2_validator_spec.rb +193 -0
data/spec/category_cache_spec.rb +226 -0
data/spec/category_fetcher_spec.rb +504 -0
data/spec/cleanup_spec.rb +197 -0
data/spec/cli_options_spec.rb +678 -0
data/spec/cli_spec.rb +876 -0
data/spec/config_spec.rb +194 -0
data/spec/constants_spec.rb +138 -0
data/spec/file_utils_spec.rb +170 -0
data/spec/fixtures/samples.rb +181 -0
data/spec/formatter_sections_spec.rb +382 -0
data/spec/global_data_cache_spec.rb +186 -0
data/spec/index_cache_spec.rb +210 -0
data/spec/integration_spec.rb +543 -0
data/spec/magic_words_spec.rb +261 -0
data/spec/markers_spec.rb +476 -0
data/spec/memory_monitor_spec.rb +192 -0
data/spec/multistream_spec.rb +690 -0
data/spec/output_writer_spec.rb +400 -0
data/spec/parser_functions_spec.rb +455 -0
data/spec/ractor_worker_spec.rb +197 -0
data/spec/regex_spec.rb +281 -0
data/spec/section_extractor_spec.rb +397 -0
data/spec/spec_helper.rb +63 -0
data/spec/stream_processor_spec.rb +579 -0
data/spec/template_data_spec.rb +246 -0
data/spec/template_expander_spec.rb +472 -0
data/spec/template_processing_spec.rb +217 -0
data/spec/text_processing_spec.rb +312 -0
data/spec/utils_spec.rb +195 -16
data/spec/wp2txt_spec.rb +510 -0
data/wp2txt.gemspec +5 -3
metadata +146 -18
data/.rubocop.yml +0 -80
data/data/output_samples/testdata_en.txt +0 -23002
data/data/output_samples/testdata_en_category.txt +0 -132
data/data/output_samples/testdata_en_summary.txt +0 -1376
data/data/output_samples/testdata_ja.txt +0 -22774
data/data/output_samples/testdata_ja_category.txt +0 -206
data/data/output_samples/testdata_ja_summary.txt +0 -1560
data/data/testdata_en.bz2 +0 -0
data/data/testdata_ja.bz2 +0 -0
data/image/screenshot.png +0 -0

data/spec/wp2txt_spec.rb ADDED Viewed

@@ -0,0 +1,510 @@
+# frozen_string_literal: true
+require_relative "spec_helper"
+require "tempfile"
+require "fileutils"
+RSpec.describe Wp2txt do
+  let(:temp_dir) { Dir.mktmpdir }
+  after do
+    FileUtils.rm_rf(temp_dir) if temp_dir && Dir.exist?(temp_dir)
+  end
+  describe Wp2txt::Splitter do
+    let(:sample_xml) do
+      <<~XML
+        <mediawiki>
+        <page>
+          <title>Test Article 1</title>
+          <text>'''Test''' is a [[test]].</text>
+        </page>
+        <page>
+          <title>Test Article 2</title>
+          <text>Another '''article''' with [[links]].</text>
+        </page>
+        </mediawiki>
+      XML
+    end
+    let(:xml_file) do
+      file = File.join(temp_dir, "test_input.xml")
+      File.write(file, sample_xml)
+      file
+    end
+    describe "#initialize" do
+      it "creates a splitter with default parameters" do
+        splitter = Wp2txt::Splitter.new(xml_file, temp_dir)
+        expect(splitter).to be_a(Wp2txt::Splitter)
+      end
+      it "creates output file base from input file" do
+        splitter = Wp2txt::Splitter.new(xml_file, temp_dir)
+        expect(splitter.instance_variable_get(:@outfile_base)).to eq("test_input-")
+      end
+    end
+    describe "#command_exist?" do
+      let(:splitter) { Wp2txt::Splitter.new(xml_file, temp_dir) }
+      it "returns path for existing command" do
+        # 'ls' should exist on all Unix systems
+        result = suppress_stdout { splitter.command_exist?("ls") }
+        expect(result).to be_truthy
+        expect(result).to include("ls")
+      end
+      it "returns false for non-existing command" do
+        result = suppress_stdout { splitter.command_exist?("nonexistent_command_xyz123") }
+        expect(result).to be false
+      end
+    end
+    describe "#get_newline" do
+      let(:splitter) { Wp2txt::Splitter.new(xml_file, temp_dir) }
+      it "reads lines from file" do
+        # Reset buffer for testing
+        splitter.instance_variable_set(:@buffer, [+""])
+        line = splitter.get_newline
+        expect(line).to be_a(String)
+      end
+    end
+    describe "#split_file" do
+      it "splits XML file and creates output files" do
+        splitter = Wp2txt::Splitter.new(xml_file, temp_dir, 1) # 1MB split size
+        splitter.split_file
+        outfiles = splitter.instance_variable_get(:@outfiles)
+        expect(outfiles).not_to be_empty
+        # Check that output files were created and renamed to .xml
+        outfiles.each do |f|
+          xml_file_path = f.sub(/\d+$/, "") + "*.xml"
+          matching_files = Dir.glob(File.join(temp_dir, "*.xml"))
+          expect(matching_files).not_to be_empty
+        end
+      end
+    end
+    describe "#fill_buffer" do
+      let(:splitter) { Wp2txt::Splitter.new(xml_file, temp_dir) }
+      it "fills buffer with content from file" do
+        splitter.instance_variable_set(:@buffer, [+""])
+        result = splitter.fill_buffer
+        buffer = splitter.instance_variable_get(:@buffer)
+        expect(result).to be true
+        expect(buffer.size).to be >= 1
+      end
+    end
+  end
+  describe Wp2txt::Runner do
+    let(:sample_xml) do
+      <<~XML
+        <page>
+          <title>Test Article</title>
+          <revision>
+            <text>'''Test Article''' is about [[testing]].
+        == Section ==
+        This is content.
+        [[Category:Testing]]
+            </text>
+          </revision>
+        </page>
+      XML
+    end
+    let(:xml_file) do
+      file = File.join(temp_dir, "test_runner.xml")
+      File.write(file, sample_xml)
+      file
+    end
+    describe "#initialize" do
+      it "creates a runner" do
+        runner = Wp2txt::Runner.new(xml_file, temp_dir, false, false)
+        expect(runner).to be_a(Wp2txt::Runner)
+      end
+    end
+    describe "#prepare" do
+      it "sets up file pointer and output base" do
+        runner = Wp2txt::Runner.new(xml_file, temp_dir, false, false)
+        expect(runner.instance_variable_get(:@outfile_base)).to eq("test_runner")
+        expect(runner.instance_variable_get(:@file_pointer)).not_to be_nil
+      end
+    end
+    describe "#get_newline" do
+      let(:runner) { Wp2txt::Runner.new(xml_file, temp_dir, false, false) }
+      it "returns lines from file" do
+        runner.instance_variable_set(:@buffer, [+""])
+        line = runner.get_newline
+        expect(line).to be_a(String)
+      end
+    end
+    describe "#fill_buffer" do
+      let(:runner) { Wp2txt::Runner.new(xml_file, temp_dir, false, false) }
+      it "reads content into buffer" do
+        runner.instance_variable_set(:@buffer, [+""])
+        result = runner.fill_buffer
+        expect(result).to be true
+      end
+    end
+    describe "#get_page" do
+      let(:runner) { Wp2txt::Runner.new(xml_file, temp_dir, false, false) }
+      it "extracts page content" do
+        page = runner.get_page
+        expect(page).to be_a(String)
+        expect(page).to include("<page>")
+        expect(page).to include("</page>")
+        expect(page).to include("Test Article")
+      end
+      it "returns false when no more pages" do
+        runner.get_page # consume first page
+        result = runner.get_page
+        expect(result).to be false
+      end
+    end
+    describe "#extract_text" do
+      let(:multi_page_xml) do
+        <<~XML
+          <page>
+            <title>Article One</title>
+            <revision>
+              <text>'''Article One''' is first.</text>
+            </revision>
+          </page>
+        XML
+      end
+      let(:multi_page_file) do
+        file = File.join(temp_dir, "multi_page.xml")
+        File.write(file, multi_page_xml)
+        file
+      end
+      it "processes pages and calls block for each article" do
+        runner = Wp2txt::Runner.new(multi_page_file, temp_dir, false, false)
+        articles_processed = []
+        runner.extract_text do |article|
+          articles_processed << article.title
+          "processed: #{article.title}\n"
+        end
+        expect(articles_processed).to include("Article One")
+        # Check output file was created
+        output_file = File.join(temp_dir, "multi_page.txt")
+        expect(File.exist?(output_file)).to be true
+      end
+    end
+  end
+  describe "Module methods" do
+    include Wp2txt
+    describe "#rename" do
+      it "renames files with extension" do
+        # Create test files
+        files = []
+        3.times do |i|
+          f = File.join(temp_dir, "testfile#{i}")
+          File.write(f, "content #{i}")
+          files << f
+        end
+        rename(files, "txt")
+        files.each_with_index do |f, i|
+          new_name = "#{f}.txt"
+          expect(File.exist?(new_name)).to be true
+          expect(File.read(new_name)).to eq("content #{i}")
+        end
+      end
+    end
+  end
+end
+# Smoke tests: the splitter must not raise on an empty input or on a
+# fifty-page input.
+RSpec.describe "Splitter with edge cases" do
+  let(:temp_dir) { Dir.mktmpdir }
+  after { FileUtils.rm_rf(temp_dir) if temp_dir && Dir.exist?(temp_dir) }
+  describe "empty file handling" do
+    # Zero-byte input file.
+    let(:empty_file) do
+      File.join(temp_dir, "empty.xml").tap { |path| File.write(path, "") }
+    end
+    it "handles empty input file" do
+      expect { Wp2txt::Splitter.new(empty_file, temp_dir).split_file }.not_to raise_error
+    end
+  end
+  describe "large content handling" do
+    # Fifty pages, each carrying 1000 filler characters in its text.
+    let(:large_xml) do
+      pages = (0...50).map do |i|
+        "<page>\n  <title>Article #{i}</title>\n  <text>#{'x' * 1000} article #{i}</text>\n</page>\n"
+      end
+      "<mediawiki>\n#{pages.join}</mediawiki>"
+    end
+    let(:large_file) do
+      File.join(temp_dir, "large.xml").tap { |path| File.write(path, large_xml) }
+    end
+    it "processes large files without error" do
+      expect { Wp2txt::Splitter.new(large_file, temp_dir, 1).split_file }.not_to raise_error
+    end
+  end
+end
+RSpec.describe "Splitter additional tests" do
+  let(:temp_dir) { Dir.mktmpdir }
+  after do
+    FileUtils.rm_rf(temp_dir) if temp_dir && Dir.exist?(temp_dir)
+  end
+  describe "#file_size" do
+    let(:test_file) do
+      file = File.join(temp_dir, "test_size.xml")
+      File.write(file, "x" * 1000)
+      file
+    end
+    it "calculates file size" do
+      splitter = Wp2txt::Splitter.new(test_file, temp_dir)
+      size = splitter.file_size(File.open(test_file, "r"))
+      expect(size).to eq(1000)
+    end
+    it "handles empty file" do
+      empty_file = File.join(temp_dir, "empty.xml")
+      File.write(empty_file, "")
+      splitter = Wp2txt::Splitter.new(empty_file, temp_dir)
+      size = splitter.file_size(File.open(empty_file, "r"))
+      expect(size).to eq(0)
+    end
+  end
+  describe "#split_file edge cases" do
+    let(:single_page_xml) do
+      <<~XML
+        <mediawiki>
+        <page>
+          <title>Single Article</title>
+          <text>Content here.</text>
+        </page>
+        </mediawiki>
+      XML
+    end
+    let(:single_file) do
+      file = File.join(temp_dir, "single.xml")
+      File.write(file, single_page_xml)
+      file
+    end
+    it "handles single page file" do
+      splitter = Wp2txt::Splitter.new(single_file, temp_dir)
+      splitter.split_file
+      xml_files = Dir.glob(File.join(temp_dir, "*.xml"))
+      expect(xml_files.size).to be >= 1
+    end
+    it "creates output files with correct base name" do
+      splitter = Wp2txt::Splitter.new(single_file, temp_dir)
+      splitter.split_file
+      xml_files = Dir.glob(File.join(temp_dir, "single-*.xml"))
+      expect(xml_files).not_to be_empty
+    end
+  end
+  describe "#prepare" do
+    it "sets up file pointer for plain XML" do
+      xml_file = File.join(temp_dir, "test.xml")
+      File.write(xml_file, "<page></page>")
+      splitter = Wp2txt::Splitter.new(xml_file, temp_dir)
+      expect(splitter.instance_variable_get(:@file_pointer)).not_to be_nil
+      expect(splitter.instance_variable_get(:@outfile_base)).to eq("test-")
+    end
+  end
+end
+RSpec.describe "Runner additional tests" do
+  let(:temp_dir) { Dir.mktmpdir }
+  after do
+    FileUtils.rm_rf(temp_dir) if temp_dir && Dir.exist?(temp_dir)
+  end
+  describe "#extract_text with del_interfile" do
+    let(:xml_content) do
+      <<~XML
+        <page>
+          <title>Delete Test</title>
+          <revision>
+            <text>Test content.</text>
+          </revision>
+        </page>
+      XML
+    end
+    it "deletes intermediate file when del_interfile is true" do
+      xml_file = File.join(temp_dir, "to_delete.xml")
+      File.write(xml_file, xml_content)
+      runner = Wp2txt::Runner.new(xml_file, temp_dir, false, true)
+      runner.extract_text { |article| "#{article.title}\n" }
+      expect(File.exist?(xml_file)).to be false
+    end
+    it "keeps intermediate file when del_interfile is false" do
+      xml_file = File.join(temp_dir, "to_keep.xml")
+      File.write(xml_file, xml_content)
+      runner = Wp2txt::Runner.new(xml_file, temp_dir, false, false)
+      runner.extract_text { |article| "#{article.title}\n" }
+      expect(File.exist?(xml_file)).to be true
+    end
+  end
+  describe "#get_page edge cases" do
+    let(:incomplete_xml) do
+      <<~XML
+        <page>
+          <title>Incomplete</title>
+          <text>No closing page tag
+      XML
+    end
+    it "handles incomplete page" do
+      xml_file = File.join(temp_dir, "incomplete.xml")
+      File.write(xml_file, incomplete_xml)
+      runner = Wp2txt::Runner.new(xml_file, temp_dir, false, false)
+      result = runner.get_page
+      # Should return something even if incomplete
+      expect(result).to be_truthy
+    end
+  end
+end
+RSpec.describe "Runner edge cases" do
+  let(:temp_dir) { Dir.mktmpdir }
+  after do
+    FileUtils.rm_rf(temp_dir) if temp_dir && Dir.exist?(temp_dir)
+  end
+  describe "page with colon in title" do
+    let(:colon_title_xml) do
+      <<~XML
+        <page>
+          <title>Category:Test</title>
+          <revision>
+            <text>Category page content</text>
+          </revision>
+        </page>
+        <page>
+          <title>Normal Article</title>
+          <revision>
+            <text>Normal content</text>
+          </revision>
+        </page>
+      XML
+    end
+    let(:colon_file) do
+      file = File.join(temp_dir, "colon_test.xml")
+      File.write(file, colon_title_xml)
+      file
+    end
+    it "skips pages with colon in title (namespace pages)" do
+      runner = Wp2txt::Runner.new(colon_file, temp_dir, false, false)
+      titles = []
+      runner.extract_text do |article|
+        titles << article.title
+        "#{article.title}\n"
+      end
+      expect(titles).to include("Normal Article")
+      expect(titles).not_to include("Category:Test")
+    end
+  end
+  describe "page with HTML comments" do
+    let(:comment_xml) do
+      <<~XML
+        <page>
+          <title>Comment Test</title>
+          <revision>
+            <text>Before comment <!-- hidden
+        multiline
+        comment --> after comment</text>
+          </revision>
+        </page>
+      XML
+    end
+    let(:comment_file) do
+      file = File.join(temp_dir, "comment_test.xml")
+      File.write(file, comment_xml)
+      file
+    end
+    it "removes HTML comments preserving newlines" do
+      runner = Wp2txt::Runner.new(comment_file, temp_dir, false, false)
+      content = ""
+      runner.extract_text do |article|
+        content = article.elements.map { |e| e.last }.join("\n")
+        content
+      end
+      expect(content).to include("Before comment")
+      expect(content).to include("after comment")
+      expect(content).not_to include("hidden")
+    end
+  end
+end

data/wp2txt.gemspec CHANGED Viewed

@@ -10,9 +10,8 @@ Gem::Specification.new do |s|
   s.homepage    = "https://github.com/yohasebe/wp2txt"
   s.summary     = "A command-line toolkit to extract text content and category data from Wikipedia dump files"
   s.description = "WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata."
-  s.rubyforge_project = "wp2txt"
   s.license = "MIT"
-  s.required_ruby_version = Gem::Requirement.new(">= 2.6")
+  s.required_ruby_version = Gem::Requirement.new(">= 3.0")
   s.files = `git ls-files`.split("\n")
   s.files -= ["data/*", "image/*"]
   s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
@@ -21,11 +20,14 @@ Gem::Specification.new do |s|
   s.add_development_dependency "bundler"
   s.add_development_dependency "rake"
   s.add_development_dependency "rspec"
+  s.add_development_dependency "simplecov"
+  s.add_development_dependency "webmock"
   s.add_dependency "htmlentities"
   s.add_dependency "nokogiri"
   s.add_dependency "optimist"
   s.add_dependency "parallel"
   s.add_dependency "pastel"
-  s.add_dependency "ruby-progressbar"
+  s.add_dependency "tty-progressbar"
   s.add_dependency "tty-spinner"
+  s.add_dependency "sqlite3"
 end