wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,210 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+ require "tmpdir"
5
+ require "fileutils"
6
+
7
# Specs for Wp2txt::IndexCache.
#
# Every example runs against a throwaway directory that holds both the dummy
# source index file and the cache file, so nothing leaks between examples.
RSpec.describe Wp2txt::IndexCache do
  let(:cache_dir) { Dir.mktmpdir("wp2txt_index_cache_test_") }
  let(:source_file) { File.join(cache_dir, "test-index.txt") }
  let(:cache) { described_class.new(source_file, cache_dir: cache_dir) }

  before do
    # Create a dummy source file
    File.write(source_file, "test content")
  end

  after do
    # Removes the source file and anything the cache wrote next to it.
    FileUtils.rm_rf(cache_dir)
  end

  describe "#initialize" do
    it "builds cache path from source file" do
      expect(cache.cache_path).to include("test")
      expect(cache.cache_path).to end_with(".sqlite3")
    end

    it "stores source path" do
      expect(cache.source_path).to eq source_file
    end
  end

  describe "#valid?" do
    it "returns false when cache does not exist" do
      expect(cache.valid?).to be false
    end

    it "returns false when source file does not exist" do
      FileUtils.rm_f(source_file)
      expect(cache.valid?).to be false
    end

    context "with saved cache" do
      before do
        entries = {
          "Article 1" => { offset: 1000, page_id: 1, title: "Article 1" },
          "Article 2" => { offset: 2000, page_id: 2, title: "Article 2" }
        }
        cache.save(entries, [0, 1000, 2000])
      end

      it "returns true for valid cache" do
        expect(cache.valid?).to be true
      end

      it "returns false when source file changes" do
        # Rewrite the source with different (longer) content and set its mtime
        # explicitly to a later time. Doing this with File.utime instead of
        # sleeping keeps the example fast and independent of the filesystem's
        # timestamp granularity.
        bumped_mtime = File.mtime(source_file) + 60
        File.write(source_file, "modified content that is longer")
        File.utime(bumped_mtime, bumped_mtime, source_file)

        expect(cache.valid?).to be false
      end
    end
  end

  describe "#save and #load" do
    let(:entries) do
      {
        "Article A" => { offset: 100, page_id: 1, title: "Article A" },
        "Article B" => { offset: 200, page_id: 2, title: "Article B" },
        "Article C" => { offset: 300, page_id: 3, title: "Article C" }
      }
    end
    let(:stream_offsets) { [0, 100, 200, 300] }

    it "saves and loads entries" do
      cache.save(entries, stream_offsets)

      loaded = cache.load
      expect(loaded[:entries_by_title].size).to eq 3
      expect(loaded[:entries_by_title]["Article A"][:offset]).to eq 100
      expect(loaded[:entries_by_title]["Article B"][:page_id]).to eq 2
    end

    it "saves and loads stream offsets" do
      cache.save(entries, stream_offsets)

      loaded = cache.load
      expect(loaded[:stream_offsets]).to eq stream_offsets
    end

    it "loads entries by ID" do
      cache.save(entries, stream_offsets)

      loaded = cache.load
      expect(loaded[:entries_by_id][1][:title]).to eq "Article A"
      expect(loaded[:entries_by_id][2][:title]).to eq "Article B"
    end

    it "returns nil when cache is invalid" do
      # Nothing has been saved in this example, so there is no cache to load.
      expect(cache.load).to be_nil
    end

    it "handles large number of entries" do
      large_entries = {}
      10_000.times do |i|
        large_entries["Article #{i}"] = { offset: i * 1000, page_id: i, title: "Article #{i}" }
      end

      cache.save(large_entries, [0])
      loaded = cache.load
      expect(loaded[:entries_by_title].size).to eq 10_000
    end

    it "handles Unicode titles" do
      unicode_entries = {
        "東京" => { offset: 100, page_id: 1, title: "東京" },
        "Москва" => { offset: 200, page_id: 2, title: "Москва" },
        "القاهرة" => { offset: 300, page_id: 3, title: "القاهرة" }
      }

      cache.save(unicode_entries, [0])
      loaded = cache.load
      expect(loaded[:entries_by_title]["東京"][:offset]).to eq 100
      expect(loaded[:entries_by_title]["Москва"][:offset]).to eq 200
      # The right-to-left title is saved above, so verify it round-trips too.
      expect(loaded[:entries_by_title]["القاهرة"][:offset]).to eq 300
    end
  end

  describe "#find_by_titles" do
    before do
      entries = {
        "Article 1" => { offset: 100, page_id: 1, title: "Article 1" },
        "Article 2" => { offset: 200, page_id: 2, title: "Article 2" },
        "Article 3" => { offset: 300, page_id: 3, title: "Article 3" }
      }
      cache.save(entries, [0])
    end

    it "finds existing titles" do
      results = cache.find_by_titles(["Article 1", "Article 3"])
      expect(results.size).to eq 2
      expect(results["Article 1"][:offset]).to eq 100
      expect(results["Article 3"][:offset]).to eq 300
    end

    it "ignores non-existent titles" do
      results = cache.find_by_titles(["Article 1", "Nonexistent"])
      expect(results.size).to eq 1
      expect(results).to have_key("Article 1")
      expect(results).not_to have_key("Nonexistent")
    end

    it "returns empty hash for empty input" do
      expect(cache.find_by_titles([])).to eq({})
    end

    it "returns empty hash when cache is invalid" do
      # Drop the cache written by the `before` hook, then query it.
      cache.clear!
      expect(cache.find_by_titles(["Article 1"])).to eq({})
    end
  end

  describe "#stats" do
    it "returns cache statistics" do
      entries = {
        "Article 1" => { offset: 100, page_id: 1, title: "Article 1" },
        "Article 2" => { offset: 200, page_id: 2, title: "Article 2" }
      }
      cache.save(entries, [0, 100, 200])

      stats = cache.stats
      expect(stats[:cache_path]).to eq cache.cache_path
      expect(stats[:entry_count]).to eq 2
      expect(stats[:stream_count]).to eq 3
      expect(stats[:cache_size]).to be > 0
    end

    it "returns nil when cache does not exist" do
      expect(cache.stats).to be_nil
    end
  end

  describe "#clear!" do
    it "removes cache file" do
      entries = { "Test" => { offset: 100, page_id: 1, title: "Test" } }
      cache.save(entries, [0])

      expect(File.exist?(cache.cache_path)).to be true
      cache.clear!
      expect(File.exist?(cache.cache_path)).to be false
    end
  end

  describe "concurrent access" do
    it "handles multiple readers" do
      entries = { "Test" => { offset: 100, page_id: 1, title: "Test" } }
      cache.save(entries, [0])

      # Simulate multiple readers: each thread builds its own cache object for
      # the same source file and loads it. Thread#value joins the thread and
      # re-raises any exception raised inside it.
      results = 3.times.map do
        Thread.new do
          c = described_class.new(source_file, cache_dir: cache_dir)
          c.load
        end
      end.map(&:value)

      results.each do |result|
        expect(result[:entries_by_title]).to have_key("Test")
      end
    end
  end
end