RubyGems - wp2txt - Versions diffs - 1.1.3 → 2.1.0 - Mend

wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

checksums.yaml +4 -4
data/.dockerignore +12 -0
data/.github/workflows/ci.yml +13 -13
data/.gitignore +14 -0
data/CHANGELOG.md +284 -0
data/DEVELOPMENT.md +415 -0
data/DEVELOPMENT_ja.md +415 -0
data/Dockerfile +19 -10
data/Gemfile +2 -8
data/README.md +259 -123
data/README_ja.md +375 -0
data/Rakefile +4 -0
data/bin/wp2txt +863 -161
data/lib/wp2txt/article.rb +98 -13
data/lib/wp2txt/bz2_validator.rb +239 -0
data/lib/wp2txt/category_cache.rb +313 -0
data/lib/wp2txt/cli.rb +319 -0
data/lib/wp2txt/cli_ui.rb +428 -0
data/lib/wp2txt/config.rb +158 -0
data/lib/wp2txt/constants.rb +134 -0
data/lib/wp2txt/data/html_entities.json +2135 -0
data/lib/wp2txt/data/language_metadata.json +4769 -0
data/lib/wp2txt/data/language_tiers.json +59 -0
data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
data/lib/wp2txt/data/template_aliases.json +193 -0
data/lib/wp2txt/data/wikipedia_entities.json +12 -0
data/lib/wp2txt/extractor.rb +545 -0
data/lib/wp2txt/file_utils.rb +91 -0
data/lib/wp2txt/formatter.rb +352 -0
data/lib/wp2txt/global_data_cache.rb +353 -0
data/lib/wp2txt/index_cache.rb +258 -0
data/lib/wp2txt/magic_words.rb +353 -0
data/lib/wp2txt/memory_monitor.rb +236 -0
data/lib/wp2txt/multistream.rb +1383 -0
data/lib/wp2txt/output_writer.rb +182 -0
data/lib/wp2txt/parser_functions.rb +606 -0
data/lib/wp2txt/ractor_worker.rb +215 -0
data/lib/wp2txt/regex.rb +396 -12
data/lib/wp2txt/section_extractor.rb +354 -0
data/lib/wp2txt/stream_processor.rb +271 -0
data/lib/wp2txt/template_expander.rb +830 -0
data/lib/wp2txt/text_processing.rb +337 -0
data/lib/wp2txt/utils.rb +629 -270
data/lib/wp2txt/version.rb +1 -1
data/lib/wp2txt.rb +53 -26
data/scripts/benchmark_regex.rb +161 -0
data/scripts/fetch_html_entities.rb +94 -0
data/scripts/fetch_language_metadata.rb +180 -0
data/scripts/fetch_mediawiki_data.rb +334 -0
data/scripts/fetch_template_data.rb +186 -0
data/scripts/profile_memory.rb +139 -0
data/spec/article_spec.rb +402 -0
data/spec/auto_download_spec.rb +314 -0
data/spec/bz2_validator_spec.rb +193 -0
data/spec/category_cache_spec.rb +226 -0
data/spec/category_fetcher_spec.rb +504 -0
data/spec/cleanup_spec.rb +197 -0
data/spec/cli_options_spec.rb +678 -0
data/spec/cli_spec.rb +876 -0
data/spec/config_spec.rb +194 -0
data/spec/constants_spec.rb +138 -0
data/spec/file_utils_spec.rb +170 -0
data/spec/fixtures/samples.rb +181 -0
data/spec/formatter_sections_spec.rb +382 -0
data/spec/global_data_cache_spec.rb +186 -0
data/spec/index_cache_spec.rb +210 -0
data/spec/integration_spec.rb +543 -0
data/spec/magic_words_spec.rb +261 -0
data/spec/markers_spec.rb +476 -0
data/spec/memory_monitor_spec.rb +192 -0
data/spec/multistream_spec.rb +690 -0
data/spec/output_writer_spec.rb +400 -0
data/spec/parser_functions_spec.rb +455 -0
data/spec/ractor_worker_spec.rb +197 -0
data/spec/regex_spec.rb +281 -0
data/spec/section_extractor_spec.rb +397 -0
data/spec/spec_helper.rb +63 -0
data/spec/stream_processor_spec.rb +579 -0
data/spec/template_data_spec.rb +246 -0
data/spec/template_expander_spec.rb +472 -0
data/spec/template_processing_spec.rb +217 -0
data/spec/text_processing_spec.rb +312 -0
data/spec/utils_spec.rb +195 -16
data/spec/wp2txt_spec.rb +510 -0
data/wp2txt.gemspec +5 -3
metadata +146 -18
data/.rubocop.yml +0 -80
data/data/output_samples/testdata_en.txt +0 -23002
data/data/output_samples/testdata_en_category.txt +0 -132
data/data/output_samples/testdata_en_summary.txt +0 -1376
data/data/output_samples/testdata_ja.txt +0 -22774
data/data/output_samples/testdata_ja_category.txt +0 -206
data/data/output_samples/testdata_ja_summary.txt +0 -1560
data/data/testdata_en.bz2 +0 -0
data/data/testdata_ja.bz2 +0 -0
data/image/screenshot.png +0 -0

data/spec/article_spec.rb ADDED Viewed

@@ -0,0 +1,402 @@
+# frozen_string_literal: true
+require_relative "spec_helper"
+require_relative "fixtures/samples"
+RSpec.describe Wp2txt::Article do
+  # Use let blocks for lazy evaluation to avoid triggering bugs at load time
+  let(:english_article) { Wp2txt::TestSamples::ENGLISH_ARTICLE }
+  let(:japanese_article) { Wp2txt::TestSamples::JAPANESE_ARTICLE }
+  let(:german_article) { Wp2txt::TestSamples::GERMAN_ARTICLE }
+  let(:french_article) { Wp2txt::TestSamples::FRENCH_ARTICLE }
+  let(:chinese_article) { Wp2txt::TestSamples::CHINESE_ARTICLE }
+  let(:russian_article) { Wp2txt::TestSamples::RUSSIAN_ARTICLE }
+  let(:korean_article) { Wp2txt::TestSamples::KOREAN_ARTICLE }
+  let(:arabic_article) { Wp2txt::TestSamples::ARABIC_ARTICLE }
+  let(:emoji_content) { Wp2txt::TestSamples::EMOJI_CONTENT }
+  let(:deeply_nested) { Wp2txt::TestSamples::DEEPLY_NESTED }
+  let(:malformed_markup) { Wp2txt::TestSamples::MALFORMED_MARKUP }
+  let(:nested_templates) { Wp2txt::TestSamples::NESTED_TEMPLATES }
+  let(:table_content) { Wp2txt::TestSamples::TABLE_CONTENT }
+  let(:reference_content) { Wp2txt::TestSamples::REFERENCE_CONTENT }
+  let(:multiline_link) { Wp2txt::TestSamples::MULTILINE_LINK }
+  describe "#parse" do
+    it "classifies headings correctly" do
+      article = Wp2txt::Article.new("== Heading ==\nParagraph text")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_heading)
+      expect(types).to include(:mw_paragraph)
+    end
+    it "classifies unordered lists" do
+      article = Wp2txt::Article.new("* Item 1\n* Item 2\n* Item 3")
+      types = article.elements.map(&:first)
+      expect(types.count(:mw_unordered)).to eq 3
+    end
+    it "classifies ordered lists" do
+      article = Wp2txt::Article.new("# First\n# Second\n# Third")
+      types = article.elements.map(&:first)
+      expect(types.count(:mw_ordered)).to eq 3
+    end
+    it "classifies definition lists" do
+      article = Wp2txt::Article.new("; Term\n: Definition")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_definition)
+    end
+    it "classifies blank lines" do
+      article = Wp2txt::Article.new("Text\n\nMore text")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_blank)
+    end
+    it "handles multi-line templates" do
+      article = Wp2txt::Article.new(nested_templates)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_ml_template)
+    end
+    it "handles table content" do
+      article = Wp2txt::Article.new(table_content)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_table)
+    end
+    it "detects redirects" do
+      article = Wp2txt::Article.new("#REDIRECT [[Other Page]]")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_redirect)
+    end
+  end
+  describe "#categories" do
+    it "extracts English categories" do
+      article = Wp2txt::Article.new(english_article)
+      categories = article.categories.flatten
+      expect(categories).to include("Tests")
+    end
+    # Tests for multilingual category extraction
+    # Will fail until multilingual support is implemented
+    # it "extracts Japanese categories" do
+    #   article = Wp2txt::Article.new(japanese_article)
+    #   categories = article.categories.flatten
+    #   expect(categories).to include("テスト")
+    # end
+    it "extracts multiple categories from one article" do
+      article = Wp2txt::Article.new(english_article)
+      categories = article.categories.flatten
+      expect(categories.size).to be >= 1
+    end
+  end
+  describe "edge cases" do
+    it "handles malformed markup gracefully" do
+      # This test exposes the exit bug in convert_characters
+      expect { Wp2txt::Article.new(malformed_markup) }.not_to raise_error
+    end
+    it "handles deeply nested templates" do
+      # This test exposes the exit bug in convert_characters
+      expect { Wp2txt::Article.new(deeply_nested) }.not_to raise_error
+    end
+    it "handles empty input" do
+      article = Wp2txt::Article.new("")
+      expect(article.elements).to be_empty
+    end
+    it "handles whitespace-only input" do
+      article = Wp2txt::Article.new("   \n   \n   ")
+      expect { article }.not_to raise_error
+    end
+  end
+  describe "title handling" do
+    it "stores the article title" do
+      article = Wp2txt::Article.new("Content", "Test Title")
+      expect(article.title).to eq "Test Title"
+    end
+    it "strips whitespace from title" do
+      article = Wp2txt::Article.new("Content", "  Title  ")
+      expect(article.title).to eq "Title"
+    end
+  end
+  describe "multilingual content" do
+    it "handles Japanese content" do
+      expect { Wp2txt::Article.new(japanese_article) }.not_to raise_error
+    end
+    it "handles German content" do
+      expect { Wp2txt::Article.new(german_article) }.not_to raise_error
+    end
+    it "handles Chinese content" do
+      expect { Wp2txt::Article.new(chinese_article) }.not_to raise_error
+    end
+    it "handles Russian content" do
+      expect { Wp2txt::Article.new(russian_article) }.not_to raise_error
+    end
+    it "handles Korean content" do
+      expect { Wp2txt::Article.new(korean_article) }.not_to raise_error
+    end
+    it "handles Arabic content" do
+      expect { Wp2txt::Article.new(arabic_article) }.not_to raise_error
+    end
+  end
+  describe "multiline structures" do
+    it "handles multiline templates" do
+      wiki = "{{Infobox\n|name = Test\n|value = 123\n}}"
+      article = Wp2txt::Article.new(wiki)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_ml_template)
+    end
+    it "extracts content after closing }} on same line" do
+      wiki = "{{Template\n|param = value\n}}Following paragraph text."
+      article = Wp2txt::Article.new(wiki)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_ml_template)
+      expect(types).to include(:mw_paragraph)
+      # Check that the paragraph content is extracted
+      paragraph = article.elements.find { |t, _| t == :mw_paragraph }
+      expect(paragraph.last).to include("Following paragraph text")
+    end
+    it "handles nested braces in multiline templates" do
+      wiki = "{{Outer\n|inner = {{nested}}\n}}After template."
+      article = Wp2txt::Article.new(wiki)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_ml_template)
+      paragraph = article.elements.find { |t, _| t == :mw_paragraph }
+      expect(paragraph.last).to include("After template")
+    end
+    it "handles multiline links" do
+      wiki = "[[File:Image.jpg|thumb|Description\nthat spans\nmultiple lines]]"
+      article = Wp2txt::Article.new(wiki)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_ml_link)
+    end
+    it "handles source code blocks" do
+      wiki = "<source lang=\"ruby\">\ndef hello\n  puts 'world'\nend\n</source>"
+      article = Wp2txt::Article.new(wiki)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_source)
+    end
+    it "handles multiline source blocks starting mid-content" do
+      # Source block that starts in middle of content
+      wiki = "text before\n<source lang=\"ruby\">\ncode here\n</source>\ntext after"
+      article = Wp2txt::Article.new(wiki)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_source)
+    end
+    it "handles math blocks" do
+      wiki = "<math>\nx = \\frac{-b \\pm \\sqrt{b^2-4ac}}{2a}\n</math>"
+      article = Wp2txt::Article.new(wiki)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_math)
+    end
+    it "handles single-line math blocks with content" do
+      wiki = "formula: <math>E = mc^2</math> explained"
+      article = Wp2txt::Article.new(wiki)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_math)
+    end
+    it "handles inputbox blocks" do
+      wiki = "<inputbox>\ntype=search\nwidth=30\n</inputbox>"
+      article = Wp2txt::Article.new(wiki)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_inputbox)
+    end
+    it "handles single-line inputbox with content" do
+      wiki = "search: <inputbox>type=search</inputbox> here"
+      article = Wp2txt::Article.new(wiki)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_inputbox)
+    end
+    it "handles HTML tables" do
+      wiki = "<table>\n<tr><td>Cell</td></tr>\n</table>"
+      article = Wp2txt::Article.new(wiki)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_htable)
+    end
+    it "handles single-line HTML tables with content" do
+      wiki = "data: <table><tr><td>x</td></tr></table> end"
+      article = Wp2txt::Article.new(wiki)
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_htable)
+    end
+  end
+  describe "pre-formatted text" do
+    it "classifies pre-formatted text" do
+      article = Wp2txt::Article.new(" preformatted text")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_pre)
+    end
+  end
+  describe "strip_tmarker option" do
+    it "strips list markers when enabled" do
+      article = Wp2txt::Article.new("* List item", "", true)
+      content = article.elements.find { |e| e.first == :mw_unordered }&.last
+      expect(content).not_to start_with("*")
+    end
+    it "preserves list markers when disabled" do
+      article = Wp2txt::Article.new("* List item", "", false)
+      content = article.elements.find { |e| e.first == :mw_unordered }&.last
+      expect(content).to start_with("*")
+    end
+    it "strips definition markers when enabled" do
+      article = Wp2txt::Article.new(": Definition", "", true)
+      content = article.elements.find { |e| e.first == :mw_definition }&.last
+      expect(content).not_to start_with(":")
+    end
+    it "strips pre markers when enabled" do
+      article = Wp2txt::Article.new(" preformatted", "", true)
+      content = article.elements.find { |e| e.first == :mw_pre }&.last
+      # Pre marker is the leading space; when stripped, content should not have it
+      expect(content&.strip).to eq("preformatted")
+    end
+    it "strips ordered list markers when enabled" do
+      article = Wp2txt::Article.new("# Numbered", "", true)
+      content = article.elements.find { |e| e.first == :mw_ordered }&.last
+      expect(content).not_to start_with("#")
+    end
+  end
+  describe "isolated elements" do
+    it "detects isolated templates" do
+      article = Wp2txt::Article.new("{{stub}}")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_isolated_template)
+    end
+    it "detects isolated tags with content" do
+      # ISOLATED_TAG_REGEX matches tags with content between them
+      # Using <span> which is not removed by remove_html
+      article = Wp2txt::Article.new("<span>content</span>")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_isolated_tag)
+    end
+  end
+  describe "link handling" do
+    it "detects standalone link lines" do
+      article = Wp2txt::Article.new("[[Link Target]]")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_link)
+    end
+  end
+  describe "multilingual redirects" do
+    it "detects German redirect" do
+      article = Wp2txt::Article.new("#WEITERLEITUNG [[Ziel]]")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_redirect)
+    end
+    it "detects French redirect" do
+      article = Wp2txt::Article.new("#REDIRECTION [[Cible]]")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_redirect)
+    end
+    it "detects Japanese redirect" do
+      article = Wp2txt::Article.new("#転送 [[転送先]]")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_redirect)
+    end
+    it "detects Russian redirect" do
+      article = Wp2txt::Article.new("#ПЕРЕНАПРАВЛЕНИЕ [[Цель]]")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_redirect)
+    end
+    it "detects Chinese redirect" do
+      article = Wp2txt::Article.new("#重定向 [[目标]]")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_redirect)
+    end
+    it "detects Japanese alternative redirect (リダイレクト)" do
+      article = Wp2txt::Article.new("#リダイレクト [[転送先]]")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_redirect)
+    end
+    it "detects Russian abbreviated redirect (перенапр)" do
+      article = Wp2txt::Article.new("#перенапр [[Цель]]")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_redirect)
+    end
+    it "detects Hindi redirect (पुनर्प्रेषित)" do
+      article = Wp2txt::Article.new("#पुनर्प्रेषित [[लक्ष्य]]")
+      types = article.elements.map(&:first)
+      expect(types).to include(:mw_redirect)
+    end
+  end
+  describe "multilingual categories" do
+    it "extracts Japanese categories" do
+      article = Wp2txt::Article.new("[[カテゴリ:テスト]]")
+      categories = article.categories.flatten
+      expect(categories).to include("テスト")
+    end
+    it "extracts German categories" do
+      article = Wp2txt::Article.new("[[Kategorie:Test]]")
+      categories = article.categories.flatten
+      expect(categories).to include("Test")
+    end
+    it "extracts French categories" do
+      article = Wp2txt::Article.new("[[Catégorie:Test]]")
+      categories = article.categories.flatten
+      expect(categories).to include("Test")
+    end
+    it "extracts Russian categories" do
+      article = Wp2txt::Article.new("[[Категория:Тест]]")
+      categories = article.categories.flatten
+      expect(categories).to include("Тест")
+    end
+    it "extracts Chinese simplified categories" do
+      article = Wp2txt::Article.new("[[分类:测试]]")
+      categories = article.categories.flatten
+      expect(categories).to include("测试")
+    end
+    it "extracts Chinese traditional categories" do
+      article = Wp2txt::Article.new("[[分類:測試]]")
+      categories = article.categories.flatten
+      expect(categories).to include("測試")
+    end
+  end
+end