wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,226 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+ require "tmpdir"
5
+ require "fileutils"
6
+
7
# Specs for Wp2txt::CategoryCache: saving and reading category membership,
# recursive page collection, tree rendering, statistics, clearing, expiry
# cleanup and per-language separation.
#
# Every example works against a throw-away cache directory created by
# Dir.mktmpdir, which is removed again after the example.
RSpec.describe Wp2txt::CategoryCache do
  let(:cache_dir) { Dir.mktmpdir("wp2txt_category_cache_test_") }
  let(:cache) { described_class.new("en", cache_dir: cache_dir) }

  # The directory is removed from `ensure` so that a failing `close` cannot
  # leave temporary directories behind.
  after do
    cache.close
  ensure
    FileUtils.rm_rf(cache_dir)
  end

  describe "#initialize" do
    it "creates cache file" do
      expect(File.exist?(cache.cache_path)).to be true
    end

    it "sets language" do
      expect(cache.lang).to eq "en"
    end

    it "uses default expiry days" do
      expect(cache.expiry_days).to eq Wp2txt::DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS
    end

    it "accepts custom expiry days" do
      custom_cache = described_class.new("ja", cache_dir: cache_dir, expiry_days: 14)
      expect(custom_cache.expiry_days).to eq 14
    ensure
      # Close the extra handle even when the expectation above fails.
      custom_cache&.close
    end
  end

  describe "#save and #get" do
    it "saves and retrieves category data" do
      pages = ["Article 1", "Article 2", "Article 3"]
      subcats = ["Subcategory A", "Subcategory B"]

      cache.save("Test Category", pages, subcats)
      data = cache.get("Test Category")

      expect(data[:pages]).to eq pages
      expect(data[:subcats]).to eq subcats
    end

    it "handles empty pages" do
      cache.save("Empty Pages", [], ["Subcat"])
      data = cache.get("Empty Pages")

      expect(data[:pages]).to eq []
      expect(data[:subcats]).to eq ["Subcat"]
    end

    it "handles empty subcategories" do
      cache.save("No Subcats", ["Article"], [])
      data = cache.get("No Subcats")

      expect(data[:pages]).to eq ["Article"]
      expect(data[:subcats]).to eq []
    end

    it "handles Unicode category names" do
      cache.save("日本の都市", ["東京", "大阪"], ["関東の都市"])
      data = cache.get("日本の都市")

      expect(data[:pages]).to contain_exactly("東京", "大阪")
      expect(data[:subcats]).to contain_exactly("関東の都市")
    end

    it "normalizes category name by removing Category: prefix" do
      # Saved with the prefix, looked up without it.
      cache.save("Category:Test", ["Article"], [])
      data = cache.get("Test")

      expect(data[:pages]).to eq ["Article"]
    end

    it "returns nil for non-existent category" do
      expect(cache.get("Nonexistent")).to be_nil
    end
  end

  describe "#cached?" do
    it "returns true for cached category" do
      cache.save("Cached", ["Article"], [])
      expect(cache.cached?("Cached")).to be true
    end

    it "returns false for non-cached category" do
      expect(cache.cached?("Not Cached")).to be false
    end
  end

  describe "#get_all_pages" do
    # Fixture hierarchy:
    #   Root (Article1, Article2)
    #   ├── Child1 (Article3)
    #   │   └── Grandchild (Article6)
    #   └── Child2 (Article4, Article5)
    before do
      cache.save("Root", ["Article1", "Article2"], ["Child1", "Child2"])
      cache.save("Child1", ["Article3"], ["Grandchild"])
      cache.save("Child2", ["Article4", "Article5"], [])
      cache.save("Grandchild", ["Article6"], [])
    end

    it "returns only direct pages with max_depth 0" do
      pages = cache.get_all_pages("Root", max_depth: 0)
      expect(pages).to contain_exactly("Article1", "Article2")
    end

    it "includes subcategory pages with max_depth 1" do
      pages = cache.get_all_pages("Root", max_depth: 1)
      expect(pages).to contain_exactly("Article1", "Article2", "Article3", "Article4", "Article5")
    end

    it "includes all nested pages with sufficient depth" do
      pages = cache.get_all_pages("Root", max_depth: 2)
      expect(pages).to contain_exactly("Article1", "Article2", "Article3", "Article4", "Article5", "Article6")
    end

    it "returns unique pages (no duplicates)" do
      # "SharedArticle" is listed in both the parent and the child category.
      cache.save("DupParent", ["SharedArticle"], ["DupChild"])
      cache.save("DupChild", ["SharedArticle", "UniqueArticle"], [])

      pages = cache.get_all_pages("DupParent", max_depth: 1)
      expect(pages.count("SharedArticle")).to eq 1
    end

    it "handles circular references" do
      # Each category names the other as a subcategory; the depth limit is
      # far larger than the cycle length.
      cache.save("CircularA", ["A1"], ["CircularB"])
      cache.save("CircularB", ["B1"], ["CircularA"])

      pages = cache.get_all_pages("CircularA", max_depth: 10)
      expect(pages).to contain_exactly("A1", "B1")
    end

    it "returns empty array for non-existent category" do
      expect(cache.get_all_pages("Nonexistent")).to eq []
    end
  end

  describe "#get_tree" do
    before do
      cache.save("Root", ["A1"], ["Child"])
      cache.save("Child", ["C1", "C2"], [])
    end

    it "returns tree structure" do
      tree = cache.get_tree("Root", max_depth: 1)

      expect(tree[:name]).to eq "Root"
      expect(tree[:cached]).to be true
      expect(tree[:page_count]).to eq 1
      expect(tree[:children].size).to eq 1
      expect(tree[:children].first[:name]).to eq "Child"
    end

    it "limits depth" do
      # "Deep" sits above "Root", so it has descendants that max_depth: 0
      # must leave out.
      cache.save("Deep", ["D1"], ["Root"])

      tree = cache.get_tree("Deep", max_depth: 0)
      expect(tree[:children]).to be_empty
    end
  end

  describe "#stats" do
    it "returns cache statistics" do
      cache.save("Cat1", ["A1", "A2"], ["Sub1"])
      cache.save("Cat2", ["A3"], [])

      stats = cache.stats

      expect(stats[:lang]).to eq "en"
      expect(stats[:total_categories]).to eq 2
      expect(stats[:total_pages]).to eq 3
      expect(stats[:total_relations]).to eq 1
      expect(stats[:cache_size]).to be > 0
    end
  end

  describe "#clear!" do
    it "removes all cached data" do
      cache.save("Test", ["Article"], [])
      cache.clear!

      expect(cache.cached?("Test")).to be false
    end
  end

  describe "#cleanup_expired!" do
    it "removes expired entries" do
      cache.save("Old", ["Article"], [])

      # Back-date the entry by writing to the underlying database directly;
      # the spec uses no public API for setting cached_at.
      # NOTE(review): 30 days is assumed to exceed
      # Wp2txt::DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS - confirm if that default
      # is ever raised.
      thirty_days_ago = Time.now.to_i - (30 * 24 * 3600)
      cache.instance_variable_get(:@db).execute(
        "UPDATE categories SET cached_at = ? WHERE name = ?",
        [thirty_days_ago, "Old"]
      )

      # A second entry that keeps its fresh timestamp.
      cache.save("Fresh", ["NewArticle"], [])

      # Run the cleanup with the cache's default expiry setting.
      removed = cache.cleanup_expired!

      expect(removed).to eq 1
      expect(cache.cached?("Old")).to be false
      expect(cache.cached?("Fresh")).to be true
    end
  end

  describe "per-language isolation" do
    it "creates separate cache per language" do
      en_cache = described_class.new("en", cache_dir: cache_dir)
      ja_cache = described_class.new("ja", cache_dir: cache_dir)

      # Same category name in both languages, different contents.
      en_cache.save("Cities", ["New York"], [])
      ja_cache.save("Cities", ["東京"], [])

      expect(en_cache.get("Cities")[:pages]).to eq ["New York"]
      expect(ja_cache.get("Cities")[:pages]).to eq ["東京"]
    ensure
      # Close both handles even when an expectation above fails.
      en_cache&.close
      ja_cache&.close
    end
  end
end