wp2txt 1.1.3 → 2.1.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
data/spec/formatter_sections_spec.rb
@@ -0,0 +1,382 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "tmpdir"
5
+ require "fileutils"
6
+ require_relative "../lib/wp2txt/article"
7
+ require_relative "../lib/wp2txt/formatter"
8
+
9
+ RSpec.describe "Formatter section extraction" do
10
+ include Wp2txt::Formatter
11
+ include Wp2txt
12
+
13
+ let(:sample_wiki_text) do
14
+ <<~WIKI
15
+ '''The Godfather''' is a 1972 American crime film.
16
+
17
+ == Plot ==
18
+ The story of the Corleone crime family.
19
+
20
+ == Cast ==
21
+ * Marlon Brando as Vito Corleone
22
+ * Al Pacino as Michael Corleone
23
+
24
+ == Reception ==
25
+ The film received critical acclaim.
26
+
27
+ === Awards ===
28
+ Won three Academy Awards.
29
+
30
+ [[Category:1972 films]]
31
+ [[Category:Crime films]]
32
+ WIKI
33
+ end
34
+
35
+ let(:article) { Wp2txt::Article.new(sample_wiki_text, "The Godfather") }
36
+
37
+ describe "format_article with sections" do
38
+ context "structured JSON output" do
39
+ let(:config) do
40
+ {
41
+ format: :json,
42
+ sections: ["summary", "Plot", "Reception"],
43
+ category: true
44
+ }
45
+ end
46
+
47
+ it "returns sections object with each section" do
48
+ result = format_article(article, config)
49
+ expect(result).to be_a(Hash)
50
+ expect(result["sections"]).to be_a(Hash)
51
+ expect(result["sections"].keys).to include("summary", "Plot", "Reception")
52
+ end
53
+
54
+ it "includes summary text" do
55
+ result = format_article(article, config)
56
+ expect(result["sections"]["summary"]).to include("1972 American crime film")
57
+ end
58
+
59
+ it "includes section content" do
60
+ result = format_article(article, config)
61
+ expect(result["sections"]["Plot"]).to include("Corleone crime family")
62
+ end
63
+
64
+ it "includes subsections in parent section" do
65
+ result = format_article(article, config)
66
+ expect(result["sections"]["Reception"]).to include("Academy Awards")
67
+ end
68
+
69
+ it "includes categories" do
70
+ result = format_article(article, config)
71
+ expect(result["categories"]).to include("1972 films", "Crime films")
72
+ end
73
+ end
74
+
75
+ context "combined JSON output" do
76
+ let(:config) do
77
+ {
78
+ format: :json,
79
+ sections: ["summary", "Plot"],
80
+ section_output: "combined",
81
+ category: true
82
+ }
83
+ end
84
+
85
+ it "returns concatenated text" do
86
+ result = format_article(article, config)
87
+ expect(result["text"]).to include("crime film")
88
+ expect(result["text"]).to include("Corleone")
89
+ end
90
+
91
+ it "includes sections_included array" do
92
+ result = format_article(article, config)
93
+ expect(result["sections_included"]).to eq(["summary", "Plot"])
94
+ end
95
+ end
96
+
97
+ context "structured text output" do
98
+ let(:config) do
99
+ {
100
+ format: :text,
101
+ sections: ["summary", "Plot", "Cast"],
102
+ category: true
103
+ }
104
+ end
105
+
106
+ it "includes TITLE header" do
107
+ result = format_article(article, config)
108
+ expect(result).to include("TITLE: The Godfather")
109
+ end
110
+
111
+ it "includes SECTION labels" do
112
+ result = format_article(article, config)
113
+ expect(result).to include("SECTION [summary]:")
114
+ expect(result).to include("SECTION [Plot]:")
115
+ expect(result).to include("SECTION [Cast]:")
116
+ end
117
+
118
+ it "includes CATEGORIES footer" do
119
+ result = format_article(article, config)
120
+ expect(result).to include("CATEGORIES: 1972 films, Crime films")
121
+ end
122
+ end
123
+
124
+ context "combined text output" do
125
+ let(:config) do
126
+ {
127
+ format: :text,
128
+ sections: ["summary", "Plot"],
129
+ section_output: "combined",
130
+ category: true
131
+ }
132
+ end
133
+
134
+ it "includes SECTIONS header listing included sections" do
135
+ result = format_article(article, config)
136
+ expect(result).to include("SECTIONS: summary, Plot")
137
+ end
138
+
139
+ it "includes concatenated content" do
140
+ result = format_article(article, config)
141
+ expect(result).to include("crime film")
142
+ expect(result).to include("Corleone")
143
+ end
144
+ end
145
+
146
+ context "with non-existent sections" do
147
+ let(:config) do
148
+ {
149
+ format: :json,
150
+ sections: ["summary", "Gameplay", "Plot"],
151
+ category: true
152
+ }
153
+ end
154
+
155
+ it "returns nil for non-existent sections" do
156
+ result = format_article(article, config)
157
+ expect(result["sections"]["Gameplay"]).to be_nil
158
+ expect(result["sections"]["Plot"]).not_to be_nil
159
+ end
160
+ end
161
+
162
+ context "with min_section_length filter" do
163
+ let(:config) do
164
+ {
165
+ format: :json,
166
+ sections: ["summary", "Plot"],
167
+ min_section_length: 100,
168
+ category: true
169
+ }
170
+ end
171
+
172
+ it "filters out short sections" do
173
+ result = format_article(article, config)
174
+ # Summary is short in this test
175
+ expect(result["sections"]["summary"]).to be_nil
176
+ end
177
+ end
178
+
179
+ context "with skip_empty option" do
180
+ let(:no_match_config) do
181
+ {
182
+ format: :json,
183
+ sections: ["Gameplay", "Soundtrack"],
184
+ skip_empty: true,
185
+ category: true
186
+ }
187
+ end
188
+
189
+ it "returns nil for articles with no matching sections" do
190
+ result = format_article(article, no_match_config)
191
+ expect(result).to be_nil
192
+ end
193
+ end
194
+ end
195
+
196
+ describe "summary_only refactoring" do
197
+ let(:config) do
198
+ {
199
+ format: :json,
200
+ summary_only: true,
201
+ category: true
202
+ }
203
+ end
204
+
205
+ it "extracts only summary" do
206
+ result = format_article(article, config)
207
+ expect(result["text"]).to include("crime film")
208
+ expect(result["text"]).not_to include("Corleone")
209
+ end
210
+
211
+ it "uses combined output mode" do
212
+ result = format_article(article, config)
213
+ expect(result["sections_included"]).to eq(["summary"])
214
+ end
215
+ end
216
+
217
+ describe "alias matching in extraction" do
218
+ let(:wiki_with_synopsis) do
219
+ <<~WIKI
220
+ A movie summary.
221
+
222
+ == Synopsis ==
223
+ The story follows the main character.
224
+
225
+ [[Category:Films]]
226
+ WIKI
227
+ end
228
+
229
+ let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Test Movie") }
230
+
231
+ let(:config) do
232
+ {
233
+ format: :json,
234
+ sections: ["summary", "Plot"],
235
+ category: true
236
+ }
237
+ end
238
+
239
+ it "matches Synopsis as alias for Plot" do
240
+ result = format_article(synopsis_article, config)
241
+ expect(result["sections"]["Plot"]).to include("main character")
242
+ end
243
+ end
244
+
245
+ describe "show_matched_sections option" do
246
+ let(:wiki_with_synopsis) do
247
+ <<~WIKI
248
+ A movie summary.
249
+
250
+ == Synopsis ==
251
+ The story follows the main character.
252
+
253
+ [[Category:Films]]
254
+ WIKI
255
+ end
256
+
257
+ let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Test Movie") }
258
+
259
+ context "when enabled" do
260
+ let(:config) do
261
+ {
262
+ format: :json,
263
+ sections: ["summary", "Plot"],
264
+ show_matched_sections: true,
265
+ category: true
266
+ }
267
+ end
268
+
269
+ it "includes matched_sections field" do
270
+ result = format_article(synopsis_article, config)
271
+ expect(result["matched_sections"]).to be_a(Hash)
272
+ expect(result["matched_sections"]["Plot"]).to eq("Synopsis")
273
+ end
274
+ end
275
+
276
+ context "when disabled (default)" do
277
+ let(:config) do
278
+ {
279
+ format: :json,
280
+ sections: ["summary", "Plot"],
281
+ show_matched_sections: false,
282
+ category: true
283
+ }
284
+ end
285
+
286
+ it "does not include matched_sections field" do
287
+ result = format_article(synopsis_article, config)
288
+ expect(result).not_to have_key("matched_sections")
289
+ end
290
+ end
291
+
292
+ context "with combined output mode" do
293
+ let(:config) do
294
+ {
295
+ format: :json,
296
+ sections: ["summary", "Plot"],
297
+ section_output: "combined",
298
+ show_matched_sections: true,
299
+ category: true
300
+ }
301
+ end
302
+
303
+ it "includes matched_sections in combined output" do
304
+ result = format_article(synopsis_article, config)
305
+ expect(result["matched_sections"]["Plot"]).to eq("Synopsis")
306
+ end
307
+ end
308
+ end
309
+
310
+ describe "no_section_aliases option" do
311
+ let(:wiki_with_synopsis) do
312
+ <<~WIKI
313
+ A movie summary.
314
+
315
+ == Synopsis ==
316
+ The story follows the main character.
317
+
318
+ [[Category:Films]]
319
+ WIKI
320
+ end
321
+
322
+ let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Test Movie") }
323
+
324
+ context "when aliases are disabled" do
325
+ let(:config) do
326
+ {
327
+ format: :json,
328
+ sections: ["summary", "Plot"],
329
+ no_section_aliases: true,
330
+ category: true
331
+ }
332
+ end
333
+
334
+ it "does not match Synopsis as Plot" do
335
+ result = format_article(synopsis_article, config)
336
+ expect(result["sections"]["Plot"]).to be_nil
337
+ end
338
+ end
339
+ end
340
+
341
+ describe "alias_file option" do
342
+ let(:temp_dir) { Dir.mktmpdir }
343
+ let(:alias_file) { File.join(temp_dir, "custom_aliases.yml") }
344
+
345
+ after { FileUtils.remove_entry(temp_dir) }
346
+
347
+ let(:wiki_with_story) do
348
+ <<~WIKI
349
+ A summary.
350
+
351
+ == Storyline ==
352
+ The narrative unfolds.
353
+
354
+ [[Category:Films]]
355
+ WIKI
356
+ end
357
+
358
+ let(:story_article) { Wp2txt::Article.new(wiki_with_story, "Story Film") }
359
+
360
+ before do
361
+ File.write(alias_file, <<~YAML)
362
+ Plot:
363
+ - Storyline
364
+ - Narrative
365
+ YAML
366
+ end
367
+
368
+ let(:config) do
369
+ {
370
+ format: :json,
371
+ sections: ["Plot"],
372
+ alias_file: alias_file,
373
+ category: true
374
+ }
375
+ end
376
+
377
+ it "uses custom aliases from file" do
378
+ result = format_article(story_article, config)
379
+ expect(result["sections"]["Plot"]).to include("narrative unfolds")
380
+ end
381
+ end
382
+ end
data/spec/global_data_cache_spec.rb
@@ -0,0 +1,186 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+ require "tmpdir"
5
+ require "fileutils"
6
+
7
+ RSpec.describe Wp2txt::GlobalDataCache do
8
+ let(:cache_dir) { Dir.mktmpdir("wp2txt_global_cache_test_") }
9
+
10
+ before do
11
+ described_class.configure(cache_dir: cache_dir, enabled: true)
12
+ described_class.clear!
13
+ end
14
+
15
+ after do
16
+ described_class.clear!
17
+ FileUtils.rm_rf(cache_dir)
18
+ end
19
+
20
+ describe ".configure" do
21
+ it "sets cache directory" do
22
+ described_class.configure(cache_dir: "/tmp/custom")
23
+ expect(described_class.cache_path).to eq "/tmp/custom/global_data.sqlite3"
24
+ end
25
+
26
+ it "can disable caching" do
27
+ described_class.configure(enabled: false)
28
+ expect(described_class.enabled).to be false
29
+ end
30
+ end
31
+
32
+ describe ".cache_path" do
33
+ it "returns path to SQLite database" do
34
+ expect(described_class.cache_path).to end_with("global_data.sqlite3")
35
+ end
36
+ end
37
+
38
+ describe ".save and .load" do
39
+ it "saves and loads data" do
40
+ test_data = { "key1" => "value1", "nested" => { "a" => 1 } }
41
+ described_class.save(:test_category, test_data)
42
+
43
+ loaded = described_class.load(:test_category)
44
+ expect(loaded).to eq test_data
45
+ end
46
+
47
+ it "returns nil for non-existent category" do
48
+ expect(described_class.load(:nonexistent)).to be_nil
49
+ end
50
+
51
+ it "handles empty hash" do
52
+ described_class.save(:empty, {})
53
+ expect(described_class.load(:empty)).to eq({})
54
+ end
55
+
56
+ it "handles arrays in data" do
57
+ test_data = { "list" => [1, 2, 3], "strings" => %w[a b c] }
58
+ described_class.save(:with_arrays, test_data)
59
+
60
+ loaded = described_class.load(:with_arrays)
61
+ expect(loaded["list"]).to eq [1, 2, 3]
62
+ expect(loaded["strings"]).to eq %w[a b c]
63
+ end
64
+ end
65
+
66
+ describe ".load_all" do
67
+ it "loads all cached categories" do
68
+ described_class.save(:cat1, { "a" => 1 })
69
+ described_class.save(:cat2, { "b" => 2 })
70
+
71
+ all = described_class.load_all
72
+ expect(all[:cat1]).to eq({ "a" => 1 })
73
+ expect(all[:cat2]).to eq({ "b" => 2 })
74
+ end
75
+
76
+ it "returns empty hash when cache is empty" do
77
+ expect(described_class.load_all).to eq({})
78
+ end
79
+ end
80
+
81
+ describe ".save_all" do
82
+ it "saves multiple categories at once" do
83
+ data = {
84
+ cat1: { "x" => 1 },
85
+ cat2: { "y" => 2 }
86
+ }
87
+ described_class.save_all(data)
88
+
89
+ expect(described_class.load(:cat1)).to eq({ "x" => 1 })
90
+ expect(described_class.load(:cat2)).to eq({ "y" => 2 })
91
+ end
92
+ end
93
+
94
+ describe ".clear!" do
95
+ it "removes the cache file" do
96
+ described_class.save(:test, { "data" => true })
97
+ expect(File.exist?(described_class.cache_path)).to be true
98
+
99
+ described_class.clear!
100
+ expect(File.exist?(described_class.cache_path)).to be false
101
+ end
102
+ end
103
+
104
+ describe ".stats" do
105
+ it "returns cache statistics" do
106
+ described_class.save(:test, { "data" => "value" })
107
+
108
+ stats = described_class.stats
109
+ expect(stats[:cache_path]).to eq described_class.cache_path
110
+ expect(stats[:cache_size]).to be > 0
111
+ expect(stats[:categories]).to be_an(Array)
112
+ expect(stats[:categories].first[:category]).to eq "test"
113
+ end
114
+
115
+ it "returns nil when cache doesn't exist" do
116
+ described_class.clear!
117
+ expect(described_class.stats).to be_nil
118
+ end
119
+ end
120
+
121
+ describe "caching disabled" do
122
+ before do
123
+ described_class.configure(cache_dir: cache_dir, enabled: false)
124
+ end
125
+
126
+ it "does not save data when disabled" do
127
+ described_class.save(:test, { "data" => true })
128
+ expect(File.exist?(described_class.cache_path)).to be false
129
+ end
130
+
131
+ it "returns nil when loading with cache disabled" do
132
+ # Enable temporarily to save
133
+ described_class.configure(cache_dir: cache_dir, enabled: true)
134
+ described_class.save(:test, { "data" => true })
135
+
136
+ # Disable and try to load
137
+ described_class.configure(cache_dir: cache_dir, enabled: false)
138
+ expect(described_class.load(:test)).to be_nil
139
+ end
140
+ end
141
+
142
+ describe "integration with real data files" do
143
+ before do
144
+ described_class.configure(cache_dir: cache_dir, enabled: true)
145
+ described_class.clear!
146
+ # Clear cached instance variables
147
+ Wp2txt.instance_variable_set(:@mediawiki_data, nil)
148
+ Wp2txt.instance_variable_set(:@template_data, nil)
149
+ Wp2txt.instance_variable_set(:@html_entities, nil)
150
+ end
151
+
152
+ it "caches mediawiki data" do
153
+ # First load - from JSON
154
+ data1 = Wp2txt.load_mediawiki_data
155
+ expect(data1).to be_a(Hash)
156
+ expect(data1).to have_key("magic_words")
157
+
158
+ # Clear instance variable to force reload
159
+ Wp2txt.instance_variable_set(:@mediawiki_data, nil)
160
+
161
+ # Second load - from cache
162
+ data2 = Wp2txt.load_mediawiki_data
163
+ expect(data2).to eq data1
164
+ end
165
+
166
+ it "caches template data" do
167
+ data1 = Wp2txt.load_template_data
168
+ expect(data1).to be_a(Hash)
169
+
170
+ Wp2txt.instance_variable_set(:@template_data, nil)
171
+
172
+ data2 = Wp2txt.load_template_data
173
+ expect(data2).to eq data1
174
+ end
175
+
176
+ it "caches html entities" do
177
+ data1 = Wp2txt.load_html_entities
178
+ expect(data1).to be_a(Hash)
179
+
180
+ Wp2txt.instance_variable_set(:@html_entities, nil)
181
+
182
+ data2 = Wp2txt.load_html_entities
183
+ expect(data2).to eq data1
184
+ end
185
+ end
186
+ end