wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,281 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+
5
+ RSpec.describe "Wp2txt MediaWiki Data Loading" do
6
+ describe "load_mediawiki_data" do
7
+ it "loads data from JSON file" do
8
+ data = Wp2txt.load_mediawiki_data
9
+ expect(data).to be_a(Hash)
10
+ expect(data).to have_key("magic_words")
11
+ expect(data).to have_key("namespaces")
12
+ end
13
+
14
+ it "contains redirect keywords" do
15
+ data = Wp2txt.load_mediawiki_data
16
+ redirects = data.dig("magic_words", "redirect")
17
+ expect(redirects).to be_an(Array)
18
+ expect(redirects).to include("REDIRECT")
19
+ expect(redirects.size).to be > 100 # Should have many languages
20
+ end
21
+
22
+ it "contains category namespaces" do
23
+ data = Wp2txt.load_mediawiki_data
24
+ categories = data.dig("namespaces", "category")
25
+ expect(categories).to be_an(Array)
26
+ expect(categories).to include("Category")
27
+ expect(categories).to include("カテゴリ") # Japanese
28
+ expect(categories).to include("분류") # Korean (Hangul)
29
+ expect(categories).to include("分類") # Chinese Traditional
30
+ end
31
+
32
+ it "contains file namespaces" do
33
+ data = Wp2txt.load_mediawiki_data
34
+ files = data.dig("namespaces", "file")
35
+ expect(files).to be_an(Array)
36
+ expect(files).to include("File")
37
+ expect(files).to include("Image")
38
+ end
39
+ end
40
+ end
41
+
42
+ RSpec.describe "Wp2txt Regex Patterns" do
43
+ # Define local references to module constants
44
+ let(:remove_hr_regex) { Wp2txt::REMOVE_HR_REGEX }
45
+ let(:in_heading_regex) { Wp2txt::IN_HEADING_REGEX }
46
+ let(:redirect_regex) { Wp2txt::REDIRECT_REGEX }
47
+ let(:category_regex) { Wp2txt::CATEGORY_REGEX }
48
+ let(:in_link_regex) { Wp2txt::IN_LINK_REGEX }
49
+ let(:ml_template_onset_regex) { Wp2txt::ML_TEMPLATE_ONSET_REGEX }
50
+ let(:ml_template_end_regex) { Wp2txt::ML_TEMPLATE_END_REGEX }
51
+ let(:blank_line_regex) { Wp2txt::BLANK_LINE_REGEX }
52
+ let(:isolated_template_regex) { Wp2txt::ISOLATED_TEMPLATE_REGEX }
53
+ let(:chrref_to_utf_regex) { Wp2txt::CHRREF_TO_UTF_REGEX }
54
+ let(:in_table_regex1) { Wp2txt::IN_TABLE_REGEX1 }
55
+ let(:in_table_regex2) { Wp2txt::IN_TABLE_REGEX2 }
56
+ let(:in_unordered_regex) { Wp2txt::IN_UNORDERED_REGEX }
57
+ let(:in_ordered_regex) { Wp2txt::IN_ORDERED_REGEX }
58
+ let(:in_definition_regex) { Wp2txt::IN_DEFINITION_REGEX }
59
+
60
+ describe "REMOVE_HR_REGEX" do
61
+ it "matches horizontal rules with 4+ hyphens" do
62
+ expect("----").to match(remove_hr_regex)
63
+ expect("----------").to match(remove_hr_regex)
64
+ expect(" ---- ").to match(remove_hr_regex)
65
+ end
66
+
67
+ it "does NOT match fewer than 4 hyphens" do
68
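+ # Regression guard: runs of 1-3 hyphens are ordinary text, not horizontal rules. NOTE(review): originally written to expose a bug in REMOVE_HR_REGEX — confirm it is fixed in this release.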
69
+ expect("-").not_to match(remove_hr_regex)
70
+ expect("--").not_to match(remove_hr_regex)
71
+ expect("---").not_to match(remove_hr_regex)
72
+ end
73
+ end
74
+
75
+ describe "IN_HEADING_REGEX" do
76
+ it "matches valid headings with equal = counts" do
77
+ expect("== Title ==").to match(in_heading_regex)
78
+ expect("=== Section ===").to match(in_heading_regex)
79
+ expect("==== Subsection ====").to match(in_heading_regex)
80
+ end
81
+
82
+ # Trailing whitespace after the closing "=" run must not prevent a heading match.
83
+ # NOTE(review): unequal "=" counts (e.g. "== Title ===") are not asserted here — confirm the intended behavior.
84
+ it "handles headings with trailing whitespace" do
85
+ expect("== Title == ").to match(in_heading_regex)
86
+ end
87
+ end
88
+
89
+ describe "REDIRECT_REGEX" do
90
+ it "captures English redirect target correctly" do
91
+ match = "#REDIRECT [[Target Page]]".match(redirect_regex)
92
+ expect(match).not_to be_nil
93
+ expect(match[1]).to eq "Target Page"
94
+ end
95
+
96
+ it "handles Japanese redirect" do
97
+ match = "#転送 [[日本語ページ]]".match(redirect_regex)
98
+ expect(match).not_to be_nil
99
+ expect(match[1]).to eq "日本語ページ"
100
+ end
101
+
102
+ it "is case-insensitive for REDIRECT" do
103
+ match = "#redirect [[Page]]".match(redirect_regex)
104
+ expect(match).not_to be_nil
105
+ end
106
+
107
+ it "handles German redirect" do
108
+ match = "#WEITERLEITUNG [[Zielseite]]".match(redirect_regex)
109
+ expect(match).not_to be_nil
110
+ expect(match[1]).to eq "Zielseite"
111
+ end
112
+
113
+ it "handles French redirect" do
114
+ match = "#REDIRECTION [[Page cible]]".match(redirect_regex)
115
+ expect(match).not_to be_nil
116
+ expect(match[1]).to eq "Page cible"
117
+ end
118
+
119
+ it "handles Russian redirect" do
120
+ match = "#ПЕРЕНАПРАВЛЕНИЕ [[Целевая страница]]".match(redirect_regex)
121
+ expect(match).not_to be_nil
122
+ expect(match[1]).to eq "Целевая страница"
123
+ end
124
+
125
+ it "handles Chinese redirect" do
126
+ match = "#重定向 [[目标页面]]".match(redirect_regex)
127
+ expect(match).not_to be_nil
128
+ expect(match[1]).to eq "目标页面"
129
+ end
130
+
131
+ it "handles Korean redirect" do
132
+ match = "#넘겨주기 [[대상 문서]]".match(redirect_regex)
133
+ expect(match).not_to be_nil
134
+ expect(match[1]).to eq "대상 문서"
135
+ end
136
+
137
+ it "handles full-width # in Japanese redirect" do
138
+ match = "#転送 [[日本語ページ]]".match(redirect_regex)
139
+ expect(match).not_to be_nil
140
+ expect(match[1]).to eq "日本語ページ"
141
+ end
142
+
143
+ it "handles Arabic redirect" do
144
+ match = "#تحويل [[الصفحة المستهدفة]]".match(redirect_regex)
145
+ expect(match).not_to be_nil
146
+ end
147
+
148
+ it "handles Hindi redirect" do
149
+ match = "#पुनर्प्रेषित [[लक्ष्य पृष्ठ]]".match(redirect_regex)
150
+ expect(match).not_to be_nil
151
+ end
152
+ end
153
+
154
+ describe "CATEGORY_REGEX" do
155
+ it "matches English categories" do
156
+ expect("[[Category:Science]]").to match(category_regex)
157
+ end
158
+
159
+ it "matches Italian/Spanish categories" do
160
+ expect("[[Categoria:Scienza]]").to match(category_regex)
161
+ end
162
+
163
+ it "matches Japanese categories" do
164
+ expect("[[カテゴリ:科学]]").to match(category_regex)
165
+ end
166
+
167
+ it "matches German categories" do
168
+ expect("[[Kategorie:Wissenschaft]]").to match(category_regex)
169
+ end
170
+
171
+ it "matches French categories" do
172
+ expect("[[Catégorie:Science]]").to match(category_regex)
173
+ end
174
+
175
+ it "matches Chinese categories" do
176
+ expect("[[分类:科学]]").to match(category_regex)
177
+ expect("[[分類:科學]]").to match(category_regex)
178
+ end
179
+
180
+ it "matches Russian categories" do
181
+ expect("[[Категория:Наука]]").to match(category_regex)
182
+ end
183
+
184
+ it "matches Korean categories" do
185
+ expect("[[분류:과학]]").to match(category_regex)
186
+ end
187
+
188
+ it "matches Arabic categories" do
189
+ expect("[[تصنيف:علم]]").to match(category_regex)
190
+ end
191
+ end
192
+
193
+ describe "IN_LINK_REGEX" do
194
+ it "matches wikilinks on their own line" do
195
+ expect("[[Article]]").to match(in_link_regex)
196
+ end
197
+
198
+ it "matches wikilinks with leading/trailing whitespace" do
199
+ expect(" [[Page|Text]] ").to match(in_link_regex)
200
+ end
201
+ end
202
+
203
+ describe "ML_TEMPLATE_ONSET_REGEX" do
204
+ it "matches opening of multi-line templates" do
205
+ expect("{{Infobox").to match(ml_template_onset_regex)
206
+ expect("{{Template name").to match(ml_template_onset_regex)
207
+ end
208
+
209
+ it "does not match complete templates" do
210
+ expect("{{Complete}}").not_to match(ml_template_onset_regex)
211
+ end
212
+ end
213
+
214
+ describe "ML_TEMPLATE_END_REGEX" do
215
+ it "matches closing of multi-line templates" do
216
+ expect("}}").to match(ml_template_end_regex)
217
+ expect("}} ").to match(ml_template_end_regex)
218
+ expect("content}}").to match(ml_template_end_regex)
219
+ end
220
+ end
221
+
222
+ describe "BLANK_LINE_REGEX" do
223
+ it "matches empty lines" do
224
+ expect("").to match(blank_line_regex)
225
+ expect(" ").to match(blank_line_regex)
226
+ expect("\t").to match(blank_line_regex)
227
+ end
228
+
229
+ it "does not match lines with content" do
230
+ expect("text").not_to match(blank_line_regex)
231
+ end
232
+ end
233
+
234
+ describe "ISOLATED_TEMPLATE_REGEX" do
235
+ it "matches single-line templates" do
236
+ expect("{{Template}}").to match(isolated_template_regex)
237
+ expect(" {{Template|param}} ").to match(isolated_template_regex)
238
+ end
239
+ end
240
+
241
+ describe "CHRREF_TO_UTF_REGEX" do
242
+ it "matches decimal character references" do
243
+ expect("A").to match(chrref_to_utf_regex)
244
+ expect("♪").to match(chrref_to_utf_regex)
245
+ end
246
+
247
+ it "matches hexadecimal character references" do
248
+ expect("A").to match(chrref_to_utf_regex)
249
+ expect("♪").to match(chrref_to_utf_regex)
250
+ expect("😀").to match(chrref_to_utf_regex)
251
+ end
252
+ end
253
+
254
+ describe "IN_TABLE_REGEX1 and IN_TABLE_REGEX2" do
255
+ it "matches MediaWiki table start" do
256
+ expect("{|").to match(in_table_regex1)
257
+ expect(" {|").to match(in_table_regex1)
258
+ end
259
+
260
+ it "matches MediaWiki table end" do
261
+ expect("|}").to match(in_table_regex2)
262
+ end
263
+ end
264
+
265
+ describe "List detection regexes" do
266
+ it "detects unordered list items" do
267
+ expect("* Item").to match(in_unordered_regex)
268
+ expect("** Nested").to match(in_unordered_regex)
269
+ end
270
+
271
+ it "detects ordered list items" do
272
+ expect("# Item").to match(in_ordered_regex)
273
+ expect("## Nested").to match(in_ordered_regex)
274
+ end
275
+
276
+ it "detects definition list items" do
277
+ expect("; Term").to match(in_definition_regex)
278
+ expect(": Definition").to match(in_definition_regex)
279
+ end
280
+ end
281
+ end
@@ -0,0 +1,397 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "tmpdir"
5
+ require "fileutils"
6
+ require_relative "../lib/wp2txt/article"
7
+ require_relative "../lib/wp2txt/section_extractor"
8
+
9
+ RSpec.describe Wp2txt::SectionExtractor do
10
+ let(:sample_wiki_text) do
11
+ <<~WIKI
12
+ This is the summary text before any headings.
13
+
14
+ == Early life ==
15
+ Born in Tokyo, Japan.
16
+
17
+ == Career ==
18
+ Started working in 2010.
19
+
20
+ === Publications ===
21
+ First paper (2010)
22
+ Second paper (2015)
23
+
24
+ == Reception ==
25
+ The work was well received.
26
+
27
+ == References ==
28
+ <ref>Citation</ref>
29
+
30
+ [[Category:Scientists]]
31
+ [[Category:1990 births]]
32
+ WIKI
33
+ end
34
+
35
+ let(:article) { Wp2txt::Article.new(sample_wiki_text, "Test Person") }
36
+ let(:extractor) { described_class.new }
37
+
38
+ describe "#extract_headings" do
39
+ it "extracts all section headings" do
40
+ headings = extractor.extract_headings(article)
41
+ expect(headings).to eq(["Early life", "Career", "Publications", "Reception", "References"])
42
+ end
43
+
44
+ it "returns empty array for article without headings" do
45
+ simple_article = Wp2txt::Article.new("Just some text.", "Simple")
46
+ headings = extractor.extract_headings(simple_article)
47
+ expect(headings).to eq([])
48
+ end
49
+ end
50
+
51
+ describe "#extract_headings_with_levels" do
52
+ it "extracts headings with their levels" do
53
+ headings = extractor.extract_headings_with_levels(article)
54
+
55
+ expect(headings).to include(
56
+ { name: "Early life", level: 2 },
57
+ { name: "Career", level: 2 },
58
+ { name: "Publications", level: 3 },
59
+ { name: "Reception", level: 2 }
60
+ )
61
+ end
62
+
63
+ it "correctly identifies level 3 subsections" do
64
+ headings = extractor.extract_headings_with_levels(article)
65
+ publications = headings.find { |h| h[:name] == "Publications" }
66
+ expect(publications[:level]).to eq(3)
67
+ end
68
+ end
69
+
70
+ describe "#extract_summary" do
71
+ it "extracts text before first heading" do
72
+ summary = extractor.extract_summary(article)
73
+ expect(summary).to include("This is the summary text")
74
+ end
75
+
76
+ it "returns nil for article starting with heading" do
77
+ no_summary_article = Wp2txt::Article.new("== Heading ==\nContent", "No Summary")
78
+ summary = extractor.extract_summary(no_summary_article)
79
+ expect(summary).to be_nil
80
+ end
81
+ end
82
+
83
+ describe "#extract_sections with targets" do
84
+ context "when extracting summary and specific sections" do
85
+ let(:extractor) { described_class.new(["summary", "Career", "Plot"]) }
86
+
87
+ it "includes summary when requested" do
88
+ sections = extractor.extract_sections(article)
89
+ expect(sections["summary"]).to include("summary text")
90
+ end
91
+
92
+ it "includes matching sections" do
93
+ sections = extractor.extract_sections(article)
94
+ expect(sections["Career"]).to include("Started working")
95
+ end
96
+
97
+ it "returns nil for non-existent sections" do
98
+ sections = extractor.extract_sections(article)
99
+ expect(sections["Plot"]).to be_nil
100
+ end
101
+
102
+ it "includes subsections in parent section" do
103
+ sections = extractor.extract_sections(article)
104
+ expect(sections["Career"]).to include("First paper")
105
+ end
106
+ end
107
+
108
+ context "with minimum length filter" do
109
+ let(:extractor) { described_class.new(["summary", "Career"], min_length: 50) }
110
+
111
+ it "filters out short sections but keeps long ones" do
112
+ sections = extractor.extract_sections(article)
113
+ # Career section with subsections should be long enough (>50 chars)
114
+ expect(sections["Career"]).not_to be_nil
115
+ end
116
+
117
+ it "filters out sections shorter than min_length" do
118
+ strict_extractor = described_class.new(["summary", "Early life"], min_length: 100)
119
+ sections = strict_extractor.extract_sections(article)
120
+ # Early life section is short ("Born in Tokyo, Japan.")
121
+ expect(sections["Early life"]).to be_nil
122
+ end
123
+ end
124
+ end
125
+
126
+ describe "alias matching" do
127
+ context "with default aliases" do
128
+ let(:wiki_with_synopsis) do
129
+ <<~WIKI
130
+ Summary.
131
+
132
+ == Synopsis ==
133
+ The story follows...
134
+ WIKI
135
+ end
136
+ let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Movie") }
137
+ let(:extractor) { described_class.new(["Plot"]) }
138
+
139
+ it "matches Synopsis as alias for Plot" do
140
+ sections = extractor.extract_sections(synopsis_article)
141
+ expect(sections["Plot"]).to include("story follows")
142
+ end
143
+ end
144
+
145
+ context "with aliases disabled" do
146
+ let(:wiki_with_synopsis) do
147
+ <<~WIKI
148
+ Summary.
149
+
150
+ == Synopsis ==
151
+ The story follows...
152
+ WIKI
153
+ end
154
+ let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Movie") }
155
+ let(:extractor) { described_class.new(["Plot"], use_aliases: false) }
156
+
157
+ it "does not match Synopsis when aliases are disabled" do
158
+ sections = extractor.extract_sections(synopsis_article)
159
+ expect(sections["Plot"]).to be_nil
160
+ end
161
+ end
162
+ end
163
+
164
+ describe "case-insensitive matching" do
165
+ let(:extractor) { described_class.new(["early life", "CAREER"]) }
166
+
167
+ it "matches sections regardless of case" do
168
+ sections = extractor.extract_sections(article)
169
+ expect(sections["early life"]).to include("Born in Tokyo")
170
+ expect(sections["CAREER"]).to include("Started working")
171
+ end
172
+ end
173
+
174
+ describe "#has_matching_sections?" do
175
+ context "with matching sections" do
176
+ let(:extractor) { described_class.new(["Career"]) }
177
+
178
+ it "returns true" do
179
+ expect(extractor.has_matching_sections?(article)).to be true
180
+ end
181
+ end
182
+
183
+ context "with no matching sections" do
184
+ let(:extractor) { described_class.new(["Plot", "Gameplay"]) }
185
+
186
+ it "returns false" do
187
+ expect(extractor.has_matching_sections?(article)).to be false
188
+ end
189
+ end
190
+
191
+ context "when summary is requested and exists" do
192
+ let(:extractor) { described_class.new(["summary"]) }
193
+
194
+ it "returns true" do
195
+ expect(extractor.has_matching_sections?(article)).to be true
196
+ end
197
+ end
198
+ end
199
+
200
+ describe "#should_skip?" do
201
+ context "with skip_empty: false (default)" do
202
+ let(:extractor) { described_class.new(["Plot"], skip_empty: false) }
203
+
204
+ it "returns false even when no sections match" do
205
+ expect(extractor.should_skip?(article)).to be false
206
+ end
207
+ end
208
+
209
+ context "with skip_empty: true" do
210
+ let(:extractor) { described_class.new(["Plot"], skip_empty: true) }
211
+
212
+ it "returns true when no sections match" do
213
+ expect(extractor.should_skip?(article)).to be true
214
+ end
215
+
216
+ it "returns false when sections match" do
217
+ extractor_with_match = described_class.new(["Career"], skip_empty: true)
218
+ expect(extractor_with_match.should_skip?(article)).to be false
219
+ end
220
+ end
221
+ end
222
+
223
+ describe "alias file loading" do
224
+ let(:temp_dir) { Dir.mktmpdir }
225
+ let(:alias_file) { File.join(temp_dir, "aliases.yml") }
226
+
227
+ after { FileUtils.remove_entry(temp_dir) }
228
+
229
+ context "with valid YAML alias file" do
230
+ before do
231
+ File.write(alias_file, <<~YAML)
232
+ Career:
233
+ - Work history
234
+ - Employment
235
+ Plot:
236
+ - Synopsis
237
+ - Story
238
+ YAML
239
+ end
240
+
241
+ it "loads aliases from file" do
242
+ aliases = described_class.load_aliases_from_file(alias_file)
243
+ expect(aliases["Career"]).to eq(["Work history", "Employment"])
244
+ expect(aliases["Plot"]).to eq(["Synopsis", "Story"])
245
+ end
246
+
247
+ it "merges file aliases with defaults" do
248
+ extractor = described_class.new(["Plot"], alias_file: alias_file)
249
+ # Should have both default "Synopsis" and file "Story" as aliases
250
+ wiki_with_story = <<~WIKI
251
+ == Story ==
252
+ The story begins...
253
+ WIKI
254
+ story_article = Wp2txt::Article.new(wiki_with_story, "Film")
255
+ sections = extractor.extract_sections(story_article)
256
+ expect(sections["Plot"]).to include("story begins")
257
+ end
258
+ end
259
+
260
+ context "with non-existent file" do
261
+ it "returns empty hash" do
262
+ aliases = described_class.load_aliases_from_file("/nonexistent/file.yml")
263
+ expect(aliases).to eq({})
264
+ end
265
+ end
266
+
267
+ context "with invalid YAML" do
268
+ before { File.write(alias_file, "invalid: yaml: syntax: {{") }
269
+
270
+ it "returns empty hash" do
271
+ aliases = described_class.load_aliases_from_file(alias_file)
272
+ expect(aliases).to eq({})
273
+ end
274
+ end
275
+ end
276
+
277
+ describe "matched sections tracking" do
278
+ let(:wiki_with_synopsis) do
279
+ <<~WIKI
280
+ Summary text.
281
+ == Synopsis ==
282
+ The story begins...
283
+ WIKI
284
+ end
285
+ let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Movie") }
286
+
287
+ context "with track_matches enabled" do
288
+ let(:extractor) { described_class.new(["Plot"], track_matches: true) }
289
+
290
+ it "records alias matches" do
291
+ extractor.extract_sections(synopsis_article)
292
+ expect(extractor.matched_sections["Plot"]).to eq("Synopsis")
293
+ end
294
+ end
295
+
296
+ context "with track_matches disabled (default)" do
297
+ let(:extractor) { described_class.new(["Plot"], track_matches: false) }
298
+
299
+ it "does not record matches" do
300
+ extractor.extract_sections(synopsis_article)
301
+ expect(extractor.matched_sections).to be_empty
302
+ end
303
+ end
304
+
305
+ context "with direct match (different case)" do
306
+ let(:wiki_text) { "== plot ==\nContent here." }
307
+ let(:plot_article) { Wp2txt::Article.new(wiki_text, "Film") }
308
+ let(:extractor) { described_class.new(["Plot"], track_matches: true) }
309
+
310
+ it "records case-different direct matches" do
311
+ extractor.extract_sections(plot_article)
312
+ expect(extractor.matched_sections["Plot"]).to eq("plot")
313
+ end
314
+ end
315
+ end
316
+ end
317
+
318
+ RSpec.describe Wp2txt::SectionStatsCollector do
319
+ let(:sample_wiki_text) do
320
+ <<~WIKI
321
+ Summary.
322
+ == Early life ==
323
+ Content.
324
+ == Career ==
325
+ Content.
326
+ WIKI
327
+ end
328
+
329
+ let(:another_wiki_text) do
330
+ <<~WIKI
331
+ Summary.
332
+ == Career ==
333
+ Content.
334
+ == Reception ==
335
+ Content.
336
+ WIKI
337
+ end
338
+
339
+ let(:article1) { Wp2txt::Article.new(sample_wiki_text, "Person 1") }
340
+ let(:article2) { Wp2txt::Article.new(another_wiki_text, "Work 1") }
341
+
342
+ describe "#process" do
343
+ it "counts articles" do
344
+ collector = described_class.new
345
+ collector.process(article1)
346
+ collector.process(article2)
347
+ expect(collector.total_articles).to eq(2)
348
+ end
349
+
350
+ it "counts section occurrences" do
351
+ collector = described_class.new
352
+ collector.process(article1)
353
+ collector.process(article2)
354
+
355
+ expect(collector.section_counts["Career"]).to eq(2)
356
+ expect(collector.section_counts["Early life"]).to eq(1)
357
+ expect(collector.section_counts["Reception"]).to eq(1)
358
+ end
359
+ end
360
+
361
+ describe "#top_sections" do
362
+ it "returns sections sorted by count" do
363
+ collector = described_class.new
364
+ collector.process(article1)
365
+ collector.process(article2)
366
+
367
+ top = collector.top_sections(2)
368
+ expect(top.first["name"]).to eq("Career")
369
+ expect(top.first["count"]).to eq(2)
370
+ expect(top.length).to eq(2)
371
+ end
372
+ end
373
+
374
+ describe "#to_hash" do
375
+ it "returns statistics as hash" do
376
+ collector = described_class.new
377
+ collector.process(article1)
378
+ collector.process(article2)
379
+
380
+ result = collector.to_hash(top_n: 5)
381
+ expect(result["total_articles"]).to eq(2)
382
+ expect(result["section_counts"]).to be_a(Hash)
383
+ expect(result["top_sections"]).to be_an(Array)
384
+ end
385
+ end
386
+
387
+ describe "#to_json" do
388
+ it "returns valid JSON" do
389
+ collector = described_class.new
390
+ collector.process(article1)
391
+
392
+ json = collector.to_json
393
+ parsed = JSON.parse(json)
394
+ expect(parsed["total_articles"]).to eq(1)
395
+ end
396
+ end
397
+ end