RubyGems - wp2txt - Versions diffs - 1.1.3 → 2.1.0 - Mend

wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96)

checksums.yaml +4 -4
data/.dockerignore +12 -0
data/.github/workflows/ci.yml +13 -13
data/.gitignore +14 -0
data/CHANGELOG.md +284 -0
data/DEVELOPMENT.md +415 -0
data/DEVELOPMENT_ja.md +415 -0
data/Dockerfile +19 -10
data/Gemfile +2 -8
data/README.md +259 -123
data/README_ja.md +375 -0
data/Rakefile +4 -0
data/bin/wp2txt +863 -161
data/lib/wp2txt/article.rb +98 -13
data/lib/wp2txt/bz2_validator.rb +239 -0
data/lib/wp2txt/category_cache.rb +313 -0
data/lib/wp2txt/cli.rb +319 -0
data/lib/wp2txt/cli_ui.rb +428 -0
data/lib/wp2txt/config.rb +158 -0
data/lib/wp2txt/constants.rb +134 -0
data/lib/wp2txt/data/html_entities.json +2135 -0
data/lib/wp2txt/data/language_metadata.json +4769 -0
data/lib/wp2txt/data/language_tiers.json +59 -0
data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
data/lib/wp2txt/data/template_aliases.json +193 -0
data/lib/wp2txt/data/wikipedia_entities.json +12 -0
data/lib/wp2txt/extractor.rb +545 -0
data/lib/wp2txt/file_utils.rb +91 -0
data/lib/wp2txt/formatter.rb +352 -0
data/lib/wp2txt/global_data_cache.rb +353 -0
data/lib/wp2txt/index_cache.rb +258 -0
data/lib/wp2txt/magic_words.rb +353 -0
data/lib/wp2txt/memory_monitor.rb +236 -0
data/lib/wp2txt/multistream.rb +1383 -0
data/lib/wp2txt/output_writer.rb +182 -0
data/lib/wp2txt/parser_functions.rb +606 -0
data/lib/wp2txt/ractor_worker.rb +215 -0
data/lib/wp2txt/regex.rb +396 -12
data/lib/wp2txt/section_extractor.rb +354 -0
data/lib/wp2txt/stream_processor.rb +271 -0
data/lib/wp2txt/template_expander.rb +830 -0
data/lib/wp2txt/text_processing.rb +337 -0
data/lib/wp2txt/utils.rb +629 -270
data/lib/wp2txt/version.rb +1 -1
data/lib/wp2txt.rb +53 -26
data/scripts/benchmark_regex.rb +161 -0
data/scripts/fetch_html_entities.rb +94 -0
data/scripts/fetch_language_metadata.rb +180 -0
data/scripts/fetch_mediawiki_data.rb +334 -0
data/scripts/fetch_template_data.rb +186 -0
data/scripts/profile_memory.rb +139 -0
data/spec/article_spec.rb +402 -0
data/spec/auto_download_spec.rb +314 -0
data/spec/bz2_validator_spec.rb +193 -0
data/spec/category_cache_spec.rb +226 -0
data/spec/category_fetcher_spec.rb +504 -0
data/spec/cleanup_spec.rb +197 -0
data/spec/cli_options_spec.rb +678 -0
data/spec/cli_spec.rb +876 -0
data/spec/config_spec.rb +194 -0
data/spec/constants_spec.rb +138 -0
data/spec/file_utils_spec.rb +170 -0
data/spec/fixtures/samples.rb +181 -0
data/spec/formatter_sections_spec.rb +382 -0
data/spec/global_data_cache_spec.rb +186 -0
data/spec/index_cache_spec.rb +210 -0
data/spec/integration_spec.rb +543 -0
data/spec/magic_words_spec.rb +261 -0
data/spec/markers_spec.rb +476 -0
data/spec/memory_monitor_spec.rb +192 -0
data/spec/multistream_spec.rb +690 -0
data/spec/output_writer_spec.rb +400 -0
data/spec/parser_functions_spec.rb +455 -0
data/spec/ractor_worker_spec.rb +197 -0
data/spec/regex_spec.rb +281 -0
data/spec/section_extractor_spec.rb +397 -0
data/spec/spec_helper.rb +63 -0
data/spec/stream_processor_spec.rb +579 -0
data/spec/template_data_spec.rb +246 -0
data/spec/template_expander_spec.rb +472 -0
data/spec/template_processing_spec.rb +217 -0
data/spec/text_processing_spec.rb +312 -0
data/spec/utils_spec.rb +195 -16
data/spec/wp2txt_spec.rb +510 -0
data/wp2txt.gemspec +5 -3
metadata +146 -18
data/.rubocop.yml +0 -80
data/data/output_samples/testdata_en.txt +0 -23002
data/data/output_samples/testdata_en_category.txt +0 -132
data/data/output_samples/testdata_en_summary.txt +0 -1376
data/data/output_samples/testdata_ja.txt +0 -22774
data/data/output_samples/testdata_ja_category.txt +0 -206
data/data/output_samples/testdata_ja_summary.txt +0 -1560
data/data/testdata_en.bz2 +0 -0
data/data/testdata_ja.bz2 +0 -0
data/image/screenshot.png +0 -0

data/spec/text_processing_spec.rb ADDED

@@ -0,0 +1,312 @@
+# frozen_string_literal: true
+require "spec_helper"
+RSpec.describe "Wp2txt Text Processing" do
+  include Wp2txt
+  describe "convert_characters" do
+    it "handles valid UTF-8 text" do
+      result = convert_characters("Hello World")
+      expect(result).to eq("Hello World")
+    end
+    it "handles Unicode text" do
+      result = convert_characters("日本語テキスト")
+      expect(result).to eq("日本語テキスト")
+    end
+    it "converts HTML entities" do
+      result = convert_characters("Hello &amp; World")
+      expect(result).to eq("Hello & World")
+    end
+    it "handles nil input" do
+      result = convert_characters(nil)
+      expect(result).to eq("")
+    end
+    it "handles numeric character references" do
+      result = convert_characters("&#65;&#66;&#67;")
+      expect(result).to eq("ABC")
+    end
+  end
+  describe "special_chr" do
+    it "decodes HTML entities" do
+      result = special_chr("&amp; &lt; &gt;")
+      expect(result).to eq("& < >")
+    end
+    it "decodes special quotes" do
+      result = special_chr("&ldquo;text&rdquo;")
+      expect(result).to include("text")
+    end
+  end
+  describe "chrref_to_utf" do
+    it "converts decimal character references" do
+      result = chrref_to_utf("&#65;")
+      expect(result).to eq("A")
+    end
+    it "converts hex character references" do
+      result = chrref_to_utf("&#x41;")
+      expect(result).to eq("A")
+    end
+    it "handles Japanese characters" do
+      result = chrref_to_utf("&#12354;")
+      expect(result).to eq("あ")
+    end
+    it "handles invalid codepoints" do
+      result = chrref_to_utf("&#0;")
+      expect(result).to eq("")
+    end
+    it "preserves non-reference text" do
+      result = chrref_to_utf("normal text")
+      expect(result).to eq("normal text")
+    end
+  end
+  describe "mndash" do
+    it "converts ndash template" do
+      result = mndash("1990{{ndash}}2000")
+      # The implementation wraps the dash in braces
+      expect(result).to include("–")
+    end
+    it "handles mdash" do
+      result = mndash("text{{mdash}}more")
+      expect(result).to include("–")
+    end
+    it "preserves text without dashes" do
+      result = mndash("normal text")
+      expect(result).to eq("normal text")
+    end
+  end
+  describe "process_nested_structure" do
+    it "processes simple nested brackets" do
+      result = process_nested_structure("[[test]]", "[[", "]]") do |content|
+        content.upcase
+      end
+      expect(result).to eq("TEST")
+    end
+    it "processes multiple nested levels" do
+      result = process_nested_structure("[[outer [[inner]]]]", "[[", "]]") do |content|
+        "[#{content}]"
+      end
+      # The algorithm processes innermost first, then outer
+      expect(result).to include("[inner]")
+    end
+    it "handles empty content" do
+      result = process_nested_structure("[[]]", "[[", "]]") do |_content|
+        "empty"
+      end
+      expect(result).to eq("empty")
+    end
+    it "preserves text without brackets" do
+      result = process_nested_structure("no brackets here", "[[", "]]") do |_content|
+        "replaced"
+      end
+      expect(result).to eq("no brackets here")
+    end
+    it "handles curly braces" do
+      result = process_nested_structure("{{template}}", "{{", "}}") do |content|
+        "T:#{content}"
+      end
+      expect(result).to eq("T:template")
+    end
+  end
+  describe "escape_nowiki and unescape_nowiki" do
+    it "escapes and unescapes nowiki tags" do
+      original = "text <nowiki>[[preserved]]</nowiki> more"
+      escaped = escape_nowiki(original)
+      expect(escaped).not_to include("[[preserved]]")
+      expect(escaped).to include("<nowiki-")
+      unescaped = unescape_nowiki(escaped)
+      expect(unescaped).to include("[[preserved]]")
+    end
+    it "handles multiple nowiki tags" do
+      original = "<nowiki>a</nowiki> and <nowiki>b</nowiki>"
+      escaped = escape_nowiki(original)
+      expect(escaped.scan(/<nowiki-\d+>/).size).to eq(2)
+    end
+  end
+  describe "cleanup" do
+    it "removes excessive newlines" do
+      result = cleanup("text\n\n\n\n\nmore")
+      expect(result.count("\n")).to be <= 4  # max 2 consecutive + trailing
+    end
+    it "removes empty parentheses" do
+      result = cleanup("text () more")
+      expect(result).not_to include("()")
+    end
+    it "removes empty Japanese parentheses" do
+      result = cleanup("text（）more")
+      expect(result).not_to include("（）")
+    end
+    it "adds trailing newlines" do
+      result = cleanup("text")
+      expect(result).to end_with("\n\n")
+    end
+    it "strips leading/trailing whitespace" do
+      result = cleanup("  text  ")
+      expect(result).to start_with("text")
+    end
+  end
+  describe "remove_html" do
+    it "removes HTML comments" do
+      result = remove_html("before <!-- comment --> after")
+      expect(result).to include("before")
+      expect(result).to include("after")
+      expect(result).not_to include("comment")
+    end
+    it "removes self-closing tags" do
+      result = remove_html("text<br/>more")
+      expect(result).to eq("textmore")
+    end
+    it "removes gallery tags" do
+      result = remove_html("<gallery>image.jpg</gallery>")
+      expect(result).not_to include("image.jpg")
+    end
+    it "handles nested div tags" do
+      result = remove_html("<div><div>inner</div></div>outside")
+      expect(result).to eq("outside")
+    end
+  end
+  describe "remove_complex" do
+    it "converts ruby annotations" do
+      # Ruby annotation: {{Ruby|漢字|かんじ}} style patterns
+      result = remove_complex("text{{Ruby|漢字|かんじ}}more")
+      # Should convert to 《》 format
+      expect(result).to include("漢字")
+    end
+  end
+  describe "remove_inbetween" do
+    it "removes content between angle brackets" do
+      result = remove_inbetween("before <tag> after")
+      expect(result).to eq("before  after")
+    end
+    it "removes multiple occurrences" do
+      result = remove_inbetween("a<1>b<2>c")
+      expect(result).to eq("abc")
+    end
+    it "uses custom tagset" do
+      result = remove_inbetween("before [content] after", ["[", "]"])
+      expect(result).to eq("before  after")
+    end
+  end
+  describe "remove_tag" do
+    it "removes HTML tags" do
+      result = remove_tag("<p>content</p>")
+      expect(result).to eq("content")
+    end
+    it "removes inline tags" do
+      result = remove_tag("<b>bold</b> and <i>italic</i>")
+      expect(result).to eq("bold and italic")
+    end
+  end
+  describe "remove_directive" do
+    it "removes behavior switches" do
+      result = remove_directive("__NOTOC__text")
+      expect(result).to eq("text")
+    end
+    it "removes TOC directive" do
+      result = remove_directive("before__TOC__after")
+      expect(result).to eq("beforeafter")
+    end
+  end
+  describe "remove_emphasis" do
+    it "removes bold markup" do
+      result = remove_emphasis("'''bold''' text")
+      expect(result).to include("bold")
+      expect(result).not_to include("'''")
+    end
+    it "removes italic markup" do
+      result = remove_emphasis("''italic'' text")
+      expect(result).to include("italic")
+      expect(result).not_to include("''")
+    end
+    it "removes bold-italic markup" do
+      result = remove_emphasis("'''''both''''' text")
+      expect(result).to include("both")
+      expect(result).not_to include("'''''")
+    end
+  end
+  describe "remove_hr" do
+    it "removes horizontal rules" do
+      result = remove_hr("before\n----\nafter")
+      expect(result).not_to include("----")
+    end
+    it "removes longer rules" do
+      result = remove_hr("text\n------\nmore")
+      expect(result).not_to include("------")
+    end
+  end
+  describe "remove_ref" do
+    # remove_ref removes [ref]...[/ref] markers (not HTML <ref> tags)
+    # Use make_reference first to convert <ref> to [ref]
+    it "removes [ref] marker tags" do
+      result = remove_ref("text[ref]citation[/ref]more")
+      expect(result).to eq("textmore")
+    end
+    it "removes multiple [ref] markers" do
+      result = remove_ref("a[ref]1[/ref]b[ref]2[/ref]c")
+      expect(result).to eq("abc")
+    end
+    it "preserves text without markers" do
+      result = remove_ref("text without references")
+      expect(result).to eq("text without references")
+    end
+  end
+  describe "make_reference" do
+    it "converts reference tags to markers" do
+      result = make_reference("text<ref>citation</ref>more")
+      expect(result).to include("[ref]")
+      expect(result).to include("[/ref]")
+    end
+    it "handles multiple references" do
+      result = make_reference("a<ref>1</ref>b<ref>2</ref>c")
+      expect(result.scan("[ref]").size).to eq(2)
+    end
+  end
+end

data/spec/utils_spec.rb CHANGED

@@ -1,19 +1,10 @@
 # frozen_string_literal: true
 require_relative "spec_helper"
-require_relative "../lib/wp2txt"
-require_relative "../lib/wp2txt/article"
-require_relative "../lib/wp2txt/utils"
-describe "Wp2txt" do
-  it "contains mediawiki-format related functions:" do
-  end
+RSpec.describe "Wp2txt Utils" do
   include Wp2txt
-  before do
-  end
   describe "process_nested_structure" do
     it "parse nested structure replacing str in the format specified" do
       str_before1 = "[[ab[[cde[[alfa]]]]fg]]"
@@ -77,9 +68,18 @@ describe "Wp2txt" do
   end
   describe "remove_hr" do
-    it "removes horizontal lines" do
-      str_before = "\n----\n--\n--\n"
-      str_after  = "\n\n"
+    it "removes horizontal lines with 4+ hyphens" do
+      # MediaWiki requires 4+ hyphens for horizontal rules
+      # The hyphens are removed but newlines around them are preserved
+      str_before = "text\n----\nmore"
+      str_after  = "text\n\nmore"
+      expect(remove_hr(str_before)).to eq str_after
+    end
+    it "does not remove lines with fewer than 4 hyphens" do
+      # Lines with fewer than 4 hyphens should be preserved
+      str_before = "text\n--\n---\nmore"
+      str_after  = "text\n--\n---\nmore"
       expect(remove_hr(str_before)).to eq str_after
     end
   end
@@ -97,11 +97,27 @@ describe "Wp2txt" do
   end
   describe "remove_directive" do
-    it "removes directive" do
-      str_before = "__abc__\n __def__"
+    it "removes MediaWiki magic words" do
+      # Use actual MediaWiki behavior switches (loaded from mediawiki_aliases.json)
+      str_before = "__NOTOC__\n __TOC__"
       str_after  = "\n "
       expect(remove_directive(str_before)).to eq str_after
     end
+    it "removes multilingual magic words" do
+      # Japanese/German/other language magic words should also be removed
+      str_before = "__KEIN_INHALTSVERZEICHNIS__\n__目次非表示__"
+      str_after  = "\n"
+      expect(remove_directive(str_before)).to eq str_after
+    end
+    it "preserves non-magic-word patterns" do
+      # Arbitrary __something__ patterns that aren't valid magic words should be preserved
+      # (This is the expected behavior with data-driven approach)
+      str_before = "__custom_marker__"
+      # With data-driven approach, unknown patterns are NOT removed
+      expect(remove_directive(str_before)).to eq str_before
+    end
   end
   describe "remove_emphasis" do
@@ -144,6 +160,55 @@ describe "Wp2txt" do
       expect(c2).to eq "b|c"
       expect(d2).to eq "[ɲ], /J/"
     end
+    it "handles pipe trick (empty display text)" do
+      # Namespace prefix removal
+      expect(process_interwiki_links("[[Wikipedia:著作権|]]")).to eq "著作権"
+      expect(process_interwiki_links("[[Help:Contents|]]")).to eq "Contents"
+      # Disambiguation suffix removal
+      expect(process_interwiki_links("[[Tokyo (disambiguation)|]]")).to eq "Tokyo"
+      expect(process_interwiki_links("[[Mercury (planet)|]]")).to eq "Mercury"
+      # Comma suffix removal
+      expect(process_interwiki_links("[[Paris, Texas|]]")).to eq "Paris"
+      expect(process_interwiki_links("[[San Francisco, California|]]")).to eq "San Francisco"
+      # Combined: namespace and disambiguation
+      expect(process_interwiki_links("[[Wikipedia:Manual of Style (dates)|]]")).to eq "Manual of Style"
+    end
+    it "handles interwiki links" do
+      expect(process_interwiki_links("[[Wikisource:日本国憲法]]")).to eq "Wikisource:日本国憲法"
+      expect(process_interwiki_links("[[s:日本国憲法|日本国憲法]]")).to eq "日本国憲法"
+    end
+  end
+  describe "apply_pipe_trick" do
+    it "removes namespace prefix" do
+      expect(apply_pipe_trick("Wikipedia:Manual of Style")).to eq "Manual of Style"
+      expect(apply_pipe_trick("Help:Contents")).to eq "Contents"
+      expect(apply_pipe_trick("カテゴリ:日本")).to eq "日本"
+    end
+    it "removes disambiguation parenthetical" do
+      expect(apply_pipe_trick("Mercury (planet)")).to eq "Mercury"
+      expect(apply_pipe_trick("東京 (曖昧さ回避)")).to eq "東京"
+    end
+    it "removes comma and following text" do
+      expect(apply_pipe_trick("Paris, Texas")).to eq "Paris"
+      expect(apply_pipe_trick("San Francisco, California")).to eq "San Francisco"
+    end
+    it "handles combined cases" do
+      expect(apply_pipe_trick("Wikipedia:Manual of Style (dates)")).to eq "Manual of Style"
+    end
+    it "returns original if no transformation needed" do
+      expect(apply_pipe_trick("Simple")).to eq "Simple"
+      expect(apply_pipe_trick("東京")).to eq "東京"
+    end
   end
   describe "process_external_links" do
@@ -162,8 +227,9 @@ describe "Wp2txt" do
   describe "correct_inline_template" do
     it "removes brackets and leaving some text" do
+      # Flag/country templates should be removed entirely
       str_before1 = "{{MedalCountry | {{JPN}} }}"
-      str_after1  = "JPN"
+      str_after1  = ""
       expect(correct_inline_template(str_before1)).to eq str_after1
       str_before2 = "{{lang|en|Japan}}"
@@ -182,5 +248,118 @@ describe "Wp2txt" do
       str_after5 = "日本人に多く見受けられる"
       expect(correct_inline_template(str_before5)).to eq str_after5
     end
+    it "removes citation templates entirely" do
+      expect(correct_inline_template("{{cite web|url=http://example.com|title=Test}}")).to eq ""
+      expect(correct_inline_template("{{cite book|title=Book|author=Author}}")).to eq ""
+      expect(correct_inline_template("{{sfn|Smith|2020|p=123}}")).to eq ""
+    end
+    it "extracts content from language templates" do
+      expect(correct_inline_template("{{lang-en|Hello}}")).to eq "Hello"
+      expect(correct_inline_template("{{langwithname|en|English|Hello World}}")).to eq "Hello World"
+      expect(correct_inline_template("{{IPA|/həˈloʊ/}}")).to eq "/həˈloʊ/"
+    end
+    it "formats nihongo template correctly" do
+      expect(correct_inline_template("{{nihongo|Tokyo|東京|Tōkyō}}")).to eq "Tokyo (東京, Tōkyō)"
+      expect(correct_inline_template("{{nihongo|Tokyo|東京}}")).to eq "Tokyo (東京)"
+    end
+    it "handles convert template" do
+      expect(correct_inline_template("{{convert|100|km|mi}}")).to eq "100 km"
+    end
+    it "removes flag templates" do
+      expect(correct_inline_template("{{flagicon|Japan}}")).to eq ""
+      expect(correct_inline_template("{{JPN}}")).to eq ""
+      expect(correct_inline_template("{{USA}}")).to eq ""
+    end
+  end
+  describe "parse_markers_config" do
+    it "returns default markers for true" do
+      result = parse_markers_config(true)
+      expect(result).to be_an(Array)
+      expect(result).not_to be_empty
+    end
+    it "returns empty array for false" do
+      result = parse_markers_config(false)
+      expect(result).to eq([])
+    end
+    it "filters array to valid marker types" do
+      result = parse_markers_config([:math, :code, :invalid_type])
+      expect(result).to include(:math)
+      expect(result).to include(:code)
+      expect(result).not_to include(:invalid_type)
+    end
+    it "returns default markers for unexpected input" do
+      result = parse_markers_config("unexpected string")
+      expect(result).to be_an(Array)
+      expect(result).not_to be_empty
+    end
+    it "returns default markers for nil" do
+      result = parse_markers_config(nil)
+      expect(result).to be_an(Array)
+    end
+  end
+  describe "process_interwiki_links" do
+    it "removes category links" do
+      result = process_interwiki_links("[[Category:Test]]")
+      expect(result).to eq("")
+    end
+    it "removes category links in Japanese" do
+      result = process_interwiki_links("[[カテゴリ:テスト]]")
+      expect(result).to eq("")
+    end
+    it "extracts caption from file links" do
+      result = process_interwiki_links("[[File:Image.jpg|thumb|200px|A caption]]")
+      expect(result).to include("caption")
+    end
+    it "handles file links without caption" do
+      result = process_interwiki_links("[[File:Image.jpg]]")
+      expect(result).to eq("")
+    end
+    it "handles pipe trick" do
+      result = process_interwiki_links("[[Tokyo (city)|]]")
+      expect(result).to eq("Tokyo")
+    end
+    it "handles simple links" do
+      result = process_interwiki_links("[[Simple Link]]")
+      expect(result).to eq("Simple Link")
+    end
+    it "handles links with display text" do
+      result = process_interwiki_links("[[Target|Display Text]]")
+      expect(result).to eq("Display Text")
+    end
+  end
+  describe "marker_placeholder" do
+    it "creates placeholder with marker type" do
+      result = marker_placeholder(:math)
+      expect(result).to include("MATH")
+      expect(result).to include("««")
+      expect(result).to include("»»")
+    end
+  end
+  describe "finalize_markers" do
+    it "converts placeholders to final format" do
+      placeholder = marker_placeholder(:math)
+      result = finalize_markers("text #{placeholder} more")
+      expect(result).to include("[MATH]")
+      expect(result).not_to include("««")
+    end
   end
 end