wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
data/spec/regex_spec.rb
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "spec_helper"
|
|
4
|
+
|
|
5
|
+
RSpec.describe "Wp2txt MediaWiki Data Loading" do
|
|
6
|
+
describe "load_mediawiki_data" do
|
|
7
|
+
it "loads data from JSON file" do
|
|
8
|
+
data = Wp2txt.load_mediawiki_data
|
|
9
|
+
expect(data).to be_a(Hash)
|
|
10
|
+
expect(data).to have_key("magic_words")
|
|
11
|
+
expect(data).to have_key("namespaces")
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
it "contains redirect keywords" do
|
|
15
|
+
data = Wp2txt.load_mediawiki_data
|
|
16
|
+
redirects = data.dig("magic_words", "redirect")
|
|
17
|
+
expect(redirects).to be_an(Array)
|
|
18
|
+
expect(redirects).to include("REDIRECT")
|
|
19
|
+
expect(redirects.size).to be > 100 # Should have many languages
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
it "contains category namespaces" do
|
|
23
|
+
data = Wp2txt.load_mediawiki_data
|
|
24
|
+
categories = data.dig("namespaces", "category")
|
|
25
|
+
expect(categories).to be_an(Array)
|
|
26
|
+
expect(categories).to include("Category")
|
|
27
|
+
expect(categories).to include("カテゴリ") # Japanese
|
|
28
|
+
expect(categories).to include("분류") # Korean (Hangul)
|
|
29
|
+
expect(categories).to include("分類") # Chinese Traditional
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
it "contains file namespaces" do
|
|
33
|
+
data = Wp2txt.load_mediawiki_data
|
|
34
|
+
files = data.dig("namespaces", "file")
|
|
35
|
+
expect(files).to be_an(Array)
|
|
36
|
+
expect(files).to include("File")
|
|
37
|
+
expect(files).to include("Image")
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
RSpec.describe "Wp2txt Regex Patterns" do
|
|
43
|
+
# Define local references to module constants
|
|
44
|
+
let(:remove_hr_regex) { Wp2txt::REMOVE_HR_REGEX }
|
|
45
|
+
let(:in_heading_regex) { Wp2txt::IN_HEADING_REGEX }
|
|
46
|
+
let(:redirect_regex) { Wp2txt::REDIRECT_REGEX }
|
|
47
|
+
let(:category_regex) { Wp2txt::CATEGORY_REGEX }
|
|
48
|
+
let(:in_link_regex) { Wp2txt::IN_LINK_REGEX }
|
|
49
|
+
let(:ml_template_onset_regex) { Wp2txt::ML_TEMPLATE_ONSET_REGEX }
|
|
50
|
+
let(:ml_template_end_regex) { Wp2txt::ML_TEMPLATE_END_REGEX }
|
|
51
|
+
let(:blank_line_regex) { Wp2txt::BLANK_LINE_REGEX }
|
|
52
|
+
let(:isolated_template_regex) { Wp2txt::ISOLATED_TEMPLATE_REGEX }
|
|
53
|
+
let(:chrref_to_utf_regex) { Wp2txt::CHRREF_TO_UTF_REGEX }
|
|
54
|
+
let(:in_table_regex1) { Wp2txt::IN_TABLE_REGEX1 }
|
|
55
|
+
let(:in_table_regex2) { Wp2txt::IN_TABLE_REGEX2 }
|
|
56
|
+
let(:in_unordered_regex) { Wp2txt::IN_UNORDERED_REGEX }
|
|
57
|
+
let(:in_ordered_regex) { Wp2txt::IN_ORDERED_REGEX }
|
|
58
|
+
let(:in_definition_regex) { Wp2txt::IN_DEFINITION_REGEX }
|
|
59
|
+
|
|
60
|
+
describe "REMOVE_HR_REGEX" do
|
|
61
|
+
it "matches horizontal rules with 4+ hyphens" do
|
|
62
|
+
expect("----").to match(remove_hr_regex)
|
|
63
|
+
expect("----------").to match(remove_hr_regex)
|
|
64
|
+
expect(" ---- ").to match(remove_hr_regex)
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
it "does NOT match fewer than 4 hyphens" do
|
|
68
|
+
# These tests will FAIL with current implementation (exposing the bug)
|
|
69
|
+
expect("-").not_to match(remove_hr_regex)
|
|
70
|
+
expect("--").not_to match(remove_hr_regex)
|
|
71
|
+
expect("---").not_to match(remove_hr_regex)
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
describe "IN_HEADING_REGEX" do
|
|
76
|
+
it "matches valid headings with equal = counts" do
|
|
77
|
+
expect("== Title ==").to match(in_heading_regex)
|
|
78
|
+
expect("=== Section ===").to match(in_heading_regex)
|
|
79
|
+
expect("==== Subsection ====").to match(in_heading_regex)
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
# These tests document the expected behavior after fix
|
|
83
|
+
# Current implementation may not enforce matching = counts
|
|
84
|
+
it "handles headings with trailing whitespace" do
|
|
85
|
+
expect("== Title == ").to match(in_heading_regex)
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
describe "REDIRECT_REGEX" do
|
|
90
|
+
it "captures English redirect target correctly" do
|
|
91
|
+
match = "#REDIRECT [[Target Page]]".match(redirect_regex)
|
|
92
|
+
expect(match).not_to be_nil
|
|
93
|
+
expect(match[1]).to eq "Target Page"
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
it "handles Japanese redirect" do
|
|
97
|
+
match = "#転送 [[日本語ページ]]".match(redirect_regex)
|
|
98
|
+
expect(match).not_to be_nil
|
|
99
|
+
expect(match[1]).to eq "日本語ページ"
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
it "is case-insensitive for REDIRECT" do
|
|
103
|
+
match = "#redirect [[Page]]".match(redirect_regex)
|
|
104
|
+
expect(match).not_to be_nil
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
it "handles German redirect" do
|
|
108
|
+
match = "#WEITERLEITUNG [[Zielseite]]".match(redirect_regex)
|
|
109
|
+
expect(match).not_to be_nil
|
|
110
|
+
expect(match[1]).to eq "Zielseite"
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
it "handles French redirect" do
|
|
114
|
+
match = "#REDIRECTION [[Page cible]]".match(redirect_regex)
|
|
115
|
+
expect(match).not_to be_nil
|
|
116
|
+
expect(match[1]).to eq "Page cible"
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
it "handles Russian redirect" do
|
|
120
|
+
match = "#ПЕРЕНАПРАВЛЕНИЕ [[Целевая страница]]".match(redirect_regex)
|
|
121
|
+
expect(match).not_to be_nil
|
|
122
|
+
expect(match[1]).to eq "Целевая страница"
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
it "handles Chinese redirect" do
|
|
126
|
+
match = "#重定向 [[目标页面]]".match(redirect_regex)
|
|
127
|
+
expect(match).not_to be_nil
|
|
128
|
+
expect(match[1]).to eq "目标页面"
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
it "handles Korean redirect" do
|
|
132
|
+
match = "#넘겨주기 [[대상 문서]]".match(redirect_regex)
|
|
133
|
+
expect(match).not_to be_nil
|
|
134
|
+
expect(match[1]).to eq "대상 문서"
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
it "handles full-width # in Japanese redirect" do
|
|
138
|
+
match = "＃転送 [[日本語ページ]]".match(redirect_regex)
|
|
139
|
+
expect(match).not_to be_nil
|
|
140
|
+
expect(match[1]).to eq "日本語ページ"
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
it "handles Arabic redirect" do
|
|
144
|
+
match = "#تحويل [[الصفحة المستهدفة]]".match(redirect_regex)
|
|
145
|
+
expect(match).not_to be_nil
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
it "handles Hindi redirect" do
|
|
149
|
+
match = "#पुनर्प्रेषित [[लक्ष्य पृष्ठ]]".match(redirect_regex)
|
|
150
|
+
expect(match).not_to be_nil
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
describe "CATEGORY_REGEX" do
|
|
155
|
+
it "matches English categories" do
|
|
156
|
+
expect("[[Category:Science]]").to match(category_regex)
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
it "matches Italian/Spanish categories" do
|
|
160
|
+
expect("[[Categoria:Scienza]]").to match(category_regex)
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
it "matches Japanese categories" do
|
|
164
|
+
expect("[[カテゴリ:科学]]").to match(category_regex)
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
it "matches German categories" do
|
|
168
|
+
expect("[[Kategorie:Wissenschaft]]").to match(category_regex)
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
it "matches French categories" do
|
|
172
|
+
expect("[[Catégorie:Science]]").to match(category_regex)
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
it "matches Chinese categories" do
|
|
176
|
+
expect("[[分类:科学]]").to match(category_regex)
|
|
177
|
+
expect("[[分類:科學]]").to match(category_regex)
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
it "matches Russian categories" do
|
|
181
|
+
expect("[[Категория:Наука]]").to match(category_regex)
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
it "matches Korean categories" do
|
|
185
|
+
expect("[[분류:과학]]").to match(category_regex)
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
it "matches Arabic categories" do
|
|
189
|
+
expect("[[تصنيف:علم]]").to match(category_regex)
|
|
190
|
+
end
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
describe "IN_LINK_REGEX" do
|
|
194
|
+
it "matches wikilinks on their own line" do
|
|
195
|
+
expect("[[Article]]").to match(in_link_regex)
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
it "matches wikilinks with leading/trailing whitespace" do
|
|
199
|
+
expect(" [[Page|Text]] ").to match(in_link_regex)
|
|
200
|
+
end
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
describe "ML_TEMPLATE_ONSET_REGEX" do
|
|
204
|
+
it "matches opening of multi-line templates" do
|
|
205
|
+
expect("{{Infobox").to match(ml_template_onset_regex)
|
|
206
|
+
expect("{{Template name").to match(ml_template_onset_regex)
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
it "does not match complete templates" do
|
|
210
|
+
expect("{{Complete}}").not_to match(ml_template_onset_regex)
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
describe "ML_TEMPLATE_END_REGEX" do
|
|
215
|
+
it "matches closing of multi-line templates" do
|
|
216
|
+
expect("}}").to match(ml_template_end_regex)
|
|
217
|
+
expect("}} ").to match(ml_template_end_regex)
|
|
218
|
+
expect("content}}").to match(ml_template_end_regex)
|
|
219
|
+
end
|
|
220
|
+
end
|
|
221
|
+
|
|
222
|
+
describe "BLANK_LINE_REGEX" do
|
|
223
|
+
it "matches empty lines" do
|
|
224
|
+
expect("").to match(blank_line_regex)
|
|
225
|
+
expect(" ").to match(blank_line_regex)
|
|
226
|
+
expect("\t").to match(blank_line_regex)
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
it "does not match lines with content" do
|
|
230
|
+
expect("text").not_to match(blank_line_regex)
|
|
231
|
+
end
|
|
232
|
+
end
|
|
233
|
+
|
|
234
|
+
describe "ISOLATED_TEMPLATE_REGEX" do
|
|
235
|
+
it "matches single-line templates" do
|
|
236
|
+
expect("{{Template}}").to match(isolated_template_regex)
|
|
237
|
+
expect(" {{Template|param}} ").to match(isolated_template_regex)
|
|
238
|
+
end
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
describe "CHRREF_TO_UTF_REGEX" do
|
|
242
|
+
it "matches decimal character references" do
|
|
243
|
+
expect("&#65;").to match(chrref_to_utf_regex)
|
|
244
|
+
expect("&#9834;").to match(chrref_to_utf_regex)
|
|
245
|
+
end
|
|
246
|
+
|
|
247
|
+
it "matches hexadecimal character references" do
|
|
248
|
+
expect("&#x41;").to match(chrref_to_utf_regex)
|
|
249
|
+
expect("&#x266A;").to match(chrref_to_utf_regex)
|
|
250
|
+
expect("&#x1F600;").to match(chrref_to_utf_regex)
|
|
251
|
+
end
|
|
252
|
+
end
|
|
253
|
+
|
|
254
|
+
describe "IN_TABLE_REGEX1 and IN_TABLE_REGEX2" do
|
|
255
|
+
it "matches MediaWiki table start" do
|
|
256
|
+
expect("{|").to match(in_table_regex1)
|
|
257
|
+
expect(" {|").to match(in_table_regex1)
|
|
258
|
+
end
|
|
259
|
+
|
|
260
|
+
it "matches MediaWiki table end" do
|
|
261
|
+
expect("|}").to match(in_table_regex2)
|
|
262
|
+
end
|
|
263
|
+
end
|
|
264
|
+
|
|
265
|
+
describe "List detection regexes" do
|
|
266
|
+
it "detects unordered list items" do
|
|
267
|
+
expect("* Item").to match(in_unordered_regex)
|
|
268
|
+
expect("** Nested").to match(in_unordered_regex)
|
|
269
|
+
end
|
|
270
|
+
|
|
271
|
+
it "detects ordered list items" do
|
|
272
|
+
expect("# Item").to match(in_ordered_regex)
|
|
273
|
+
expect("## Nested").to match(in_ordered_regex)
|
|
274
|
+
end
|
|
275
|
+
|
|
276
|
+
it "detects definition list items" do
|
|
277
|
+
expect("; Term").to match(in_definition_regex)
|
|
278
|
+
expect(": Definition").to match(in_definition_regex)
|
|
279
|
+
end
|
|
280
|
+
end
|
|
281
|
+
end
|
|
data/spec/section_extractor_spec.rb
ADDED
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
require "tmpdir"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
require_relative "../lib/wp2txt/article"
|
|
7
|
+
require_relative "../lib/wp2txt/section_extractor"
|
|
8
|
+
|
|
9
|
+
RSpec.describe Wp2txt::SectionExtractor do
|
|
10
|
+
let(:sample_wiki_text) do
|
|
11
|
+
<<~WIKI
|
|
12
|
+
This is the summary text before any headings.
|
|
13
|
+
|
|
14
|
+
== Early life ==
|
|
15
|
+
Born in Tokyo, Japan.
|
|
16
|
+
|
|
17
|
+
== Career ==
|
|
18
|
+
Started working in 2010.
|
|
19
|
+
|
|
20
|
+
=== Publications ===
|
|
21
|
+
First paper (2010)
|
|
22
|
+
Second paper (2015)
|
|
23
|
+
|
|
24
|
+
== Reception ==
|
|
25
|
+
The work was well received.
|
|
26
|
+
|
|
27
|
+
== References ==
|
|
28
|
+
<ref>Citation</ref>
|
|
29
|
+
|
|
30
|
+
[[Category:Scientists]]
|
|
31
|
+
[[Category:1990 births]]
|
|
32
|
+
WIKI
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
let(:article) { Wp2txt::Article.new(sample_wiki_text, "Test Person") }
|
|
36
|
+
let(:extractor) { described_class.new }
|
|
37
|
+
|
|
38
|
+
describe "#extract_headings" do
|
|
39
|
+
it "extracts all section headings" do
|
|
40
|
+
headings = extractor.extract_headings(article)
|
|
41
|
+
expect(headings).to eq(["Early life", "Career", "Publications", "Reception", "References"])
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
it "returns empty array for article without headings" do
|
|
45
|
+
simple_article = Wp2txt::Article.new("Just some text.", "Simple")
|
|
46
|
+
headings = extractor.extract_headings(simple_article)
|
|
47
|
+
expect(headings).to eq([])
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
describe "#extract_headings_with_levels" do
|
|
52
|
+
it "extracts headings with their levels" do
|
|
53
|
+
headings = extractor.extract_headings_with_levels(article)
|
|
54
|
+
|
|
55
|
+
expect(headings).to include(
|
|
56
|
+
{ name: "Early life", level: 2 },
|
|
57
|
+
{ name: "Career", level: 2 },
|
|
58
|
+
{ name: "Publications", level: 3 },
|
|
59
|
+
{ name: "Reception", level: 2 }
|
|
60
|
+
)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
it "correctly identifies level 3 subsections" do
|
|
64
|
+
headings = extractor.extract_headings_with_levels(article)
|
|
65
|
+
publications = headings.find { |h| h[:name] == "Publications" }
|
|
66
|
+
expect(publications[:level]).to eq(3)
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
describe "#extract_summary" do
|
|
71
|
+
it "extracts text before first heading" do
|
|
72
|
+
summary = extractor.extract_summary(article)
|
|
73
|
+
expect(summary).to include("This is the summary text")
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
it "returns nil for article starting with heading" do
|
|
77
|
+
no_summary_article = Wp2txt::Article.new("== Heading ==\nContent", "No Summary")
|
|
78
|
+
summary = extractor.extract_summary(no_summary_article)
|
|
79
|
+
expect(summary).to be_nil
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
describe "#extract_sections with targets" do
|
|
84
|
+
context "when extracting summary and specific sections" do
|
|
85
|
+
let(:extractor) { described_class.new(["summary", "Career", "Plot"]) }
|
|
86
|
+
|
|
87
|
+
it "includes summary when requested" do
|
|
88
|
+
sections = extractor.extract_sections(article)
|
|
89
|
+
expect(sections["summary"]).to include("summary text")
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
it "includes matching sections" do
|
|
93
|
+
sections = extractor.extract_sections(article)
|
|
94
|
+
expect(sections["Career"]).to include("Started working")
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
it "returns nil for non-existent sections" do
|
|
98
|
+
sections = extractor.extract_sections(article)
|
|
99
|
+
expect(sections["Plot"]).to be_nil
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
it "includes subsections in parent section" do
|
|
103
|
+
sections = extractor.extract_sections(article)
|
|
104
|
+
expect(sections["Career"]).to include("First paper")
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
context "with minimum length filter" do
|
|
109
|
+
let(:extractor) { described_class.new(["summary", "Career"], min_length: 50) }
|
|
110
|
+
|
|
111
|
+
it "filters out short sections but keeps long ones" do
|
|
112
|
+
sections = extractor.extract_sections(article)
|
|
113
|
+
# Career section with subsections should be long enough (>50 chars)
|
|
114
|
+
expect(sections["Career"]).not_to be_nil
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
it "filters out sections shorter than min_length" do
|
|
118
|
+
strict_extractor = described_class.new(["summary", "Early life"], min_length: 100)
|
|
119
|
+
sections = strict_extractor.extract_sections(article)
|
|
120
|
+
# Early life section is short ("Born in Tokyo, Japan.")
|
|
121
|
+
expect(sections["Early life"]).to be_nil
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
describe "alias matching" do
|
|
127
|
+
context "with default aliases" do
|
|
128
|
+
let(:wiki_with_synopsis) do
|
|
129
|
+
<<~WIKI
|
|
130
|
+
Summary.
|
|
131
|
+
|
|
132
|
+
== Synopsis ==
|
|
133
|
+
The story follows...
|
|
134
|
+
WIKI
|
|
135
|
+
end
|
|
136
|
+
let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Movie") }
|
|
137
|
+
let(:extractor) { described_class.new(["Plot"]) }
|
|
138
|
+
|
|
139
|
+
it "matches Synopsis as alias for Plot" do
|
|
140
|
+
sections = extractor.extract_sections(synopsis_article)
|
|
141
|
+
expect(sections["Plot"]).to include("story follows")
|
|
142
|
+
end
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
context "with aliases disabled" do
|
|
146
|
+
let(:wiki_with_synopsis) do
|
|
147
|
+
<<~WIKI
|
|
148
|
+
Summary.
|
|
149
|
+
|
|
150
|
+
== Synopsis ==
|
|
151
|
+
The story follows...
|
|
152
|
+
WIKI
|
|
153
|
+
end
|
|
154
|
+
let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Movie") }
|
|
155
|
+
let(:extractor) { described_class.new(["Plot"], use_aliases: false) }
|
|
156
|
+
|
|
157
|
+
it "does not match Synopsis when aliases are disabled" do
|
|
158
|
+
sections = extractor.extract_sections(synopsis_article)
|
|
159
|
+
expect(sections["Plot"]).to be_nil
|
|
160
|
+
end
|
|
161
|
+
end
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
describe "case-insensitive matching" do
|
|
165
|
+
let(:extractor) { described_class.new(["early life", "CAREER"]) }
|
|
166
|
+
|
|
167
|
+
it "matches sections regardless of case" do
|
|
168
|
+
sections = extractor.extract_sections(article)
|
|
169
|
+
expect(sections["early life"]).to include("Born in Tokyo")
|
|
170
|
+
expect(sections["CAREER"]).to include("Started working")
|
|
171
|
+
end
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
describe "#has_matching_sections?" do
|
|
175
|
+
context "with matching sections" do
|
|
176
|
+
let(:extractor) { described_class.new(["Career"]) }
|
|
177
|
+
|
|
178
|
+
it "returns true" do
|
|
179
|
+
expect(extractor.has_matching_sections?(article)).to be true
|
|
180
|
+
end
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
context "with no matching sections" do
|
|
184
|
+
let(:extractor) { described_class.new(["Plot", "Gameplay"]) }
|
|
185
|
+
|
|
186
|
+
it "returns false" do
|
|
187
|
+
expect(extractor.has_matching_sections?(article)).to be false
|
|
188
|
+
end
|
|
189
|
+
end
|
|
190
|
+
|
|
191
|
+
context "when summary is requested and exists" do
|
|
192
|
+
let(:extractor) { described_class.new(["summary"]) }
|
|
193
|
+
|
|
194
|
+
it "returns true" do
|
|
195
|
+
expect(extractor.has_matching_sections?(article)).to be true
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
describe "#should_skip?" do
|
|
201
|
+
context "with skip_empty: false (default)" do
|
|
202
|
+
let(:extractor) { described_class.new(["Plot"], skip_empty: false) }
|
|
203
|
+
|
|
204
|
+
it "returns false even when no sections match" do
|
|
205
|
+
expect(extractor.should_skip?(article)).to be false
|
|
206
|
+
end
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
context "with skip_empty: true" do
|
|
210
|
+
let(:extractor) { described_class.new(["Plot"], skip_empty: true) }
|
|
211
|
+
|
|
212
|
+
it "returns true when no sections match" do
|
|
213
|
+
expect(extractor.should_skip?(article)).to be true
|
|
214
|
+
end
|
|
215
|
+
|
|
216
|
+
it "returns false when sections match" do
|
|
217
|
+
extractor_with_match = described_class.new(["Career"], skip_empty: true)
|
|
218
|
+
expect(extractor_with_match.should_skip?(article)).to be false
|
|
219
|
+
end
|
|
220
|
+
end
|
|
221
|
+
end
|
|
222
|
+
|
|
223
|
+
describe "alias file loading" do
|
|
224
|
+
let(:temp_dir) { Dir.mktmpdir }
|
|
225
|
+
let(:alias_file) { File.join(temp_dir, "aliases.yml") }
|
|
226
|
+
|
|
227
|
+
after { FileUtils.remove_entry(temp_dir) }
|
|
228
|
+
|
|
229
|
+
context "with valid YAML alias file" do
|
|
230
|
+
before do
|
|
231
|
+
File.write(alias_file, <<~YAML)
|
|
232
|
+
Career:
|
|
233
|
+
- Work history
|
|
234
|
+
- Employment
|
|
235
|
+
Plot:
|
|
236
|
+
- Synopsis
|
|
237
|
+
- Story
|
|
238
|
+
YAML
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
it "loads aliases from file" do
|
|
242
|
+
aliases = described_class.load_aliases_from_file(alias_file)
|
|
243
|
+
expect(aliases["Career"]).to eq(["Work history", "Employment"])
|
|
244
|
+
expect(aliases["Plot"]).to eq(["Synopsis", "Story"])
|
|
245
|
+
end
|
|
246
|
+
|
|
247
|
+
it "merges file aliases with defaults" do
|
|
248
|
+
extractor = described_class.new(["Plot"], alias_file: alias_file)
|
|
249
|
+
# Should have both default "Synopsis" and file "Story" as aliases
|
|
250
|
+
wiki_with_story = <<~WIKI
|
|
251
|
+
== Story ==
|
|
252
|
+
The story begins...
|
|
253
|
+
WIKI
|
|
254
|
+
story_article = Wp2txt::Article.new(wiki_with_story, "Film")
|
|
255
|
+
sections = extractor.extract_sections(story_article)
|
|
256
|
+
expect(sections["Plot"]).to include("story begins")
|
|
257
|
+
end
|
|
258
|
+
end
|
|
259
|
+
|
|
260
|
+
context "with non-existent file" do
|
|
261
|
+
it "returns empty hash" do
|
|
262
|
+
aliases = described_class.load_aliases_from_file("/nonexistent/file.yml")
|
|
263
|
+
expect(aliases).to eq({})
|
|
264
|
+
end
|
|
265
|
+
end
|
|
266
|
+
|
|
267
|
+
context "with invalid YAML" do
|
|
268
|
+
before { File.write(alias_file, "invalid: yaml: syntax: {{") }
|
|
269
|
+
|
|
270
|
+
it "returns empty hash" do
|
|
271
|
+
aliases = described_class.load_aliases_from_file(alias_file)
|
|
272
|
+
expect(aliases).to eq({})
|
|
273
|
+
end
|
|
274
|
+
end
|
|
275
|
+
end
|
|
276
|
+
|
|
277
|
+
describe "matched sections tracking" do
|
|
278
|
+
let(:wiki_with_synopsis) do
|
|
279
|
+
<<~WIKI
|
|
280
|
+
Summary text.
|
|
281
|
+
== Synopsis ==
|
|
282
|
+
The story begins...
|
|
283
|
+
WIKI
|
|
284
|
+
end
|
|
285
|
+
let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Movie") }
|
|
286
|
+
|
|
287
|
+
context "with track_matches enabled" do
|
|
288
|
+
let(:extractor) { described_class.new(["Plot"], track_matches: true) }
|
|
289
|
+
|
|
290
|
+
it "records alias matches" do
|
|
291
|
+
extractor.extract_sections(synopsis_article)
|
|
292
|
+
expect(extractor.matched_sections["Plot"]).to eq("Synopsis")
|
|
293
|
+
end
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
context "with track_matches disabled (default)" do
|
|
297
|
+
let(:extractor) { described_class.new(["Plot"], track_matches: false) }
|
|
298
|
+
|
|
299
|
+
it "does not record matches" do
|
|
300
|
+
extractor.extract_sections(synopsis_article)
|
|
301
|
+
expect(extractor.matched_sections).to be_empty
|
|
302
|
+
end
|
|
303
|
+
end
|
|
304
|
+
|
|
305
|
+
context "with direct match (different case)" do
|
|
306
|
+
let(:wiki_text) { "== plot ==\nContent here." }
|
|
307
|
+
let(:plot_article) { Wp2txt::Article.new(wiki_text, "Film") }
|
|
308
|
+
let(:extractor) { described_class.new(["Plot"], track_matches: true) }
|
|
309
|
+
|
|
310
|
+
it "records case-different direct matches" do
|
|
311
|
+
extractor.extract_sections(plot_article)
|
|
312
|
+
expect(extractor.matched_sections["Plot"]).to eq("plot")
|
|
313
|
+
end
|
|
314
|
+
end
|
|
315
|
+
end
|
|
316
|
+
end
|
|
317
|
+
|
|
318
|
+
RSpec.describe Wp2txt::SectionStatsCollector do
|
|
319
|
+
let(:sample_wiki_text) do
|
|
320
|
+
<<~WIKI
|
|
321
|
+
Summary.
|
|
322
|
+
== Early life ==
|
|
323
|
+
Content.
|
|
324
|
+
== Career ==
|
|
325
|
+
Content.
|
|
326
|
+
WIKI
|
|
327
|
+
end
|
|
328
|
+
|
|
329
|
+
let(:another_wiki_text) do
|
|
330
|
+
<<~WIKI
|
|
331
|
+
Summary.
|
|
332
|
+
== Career ==
|
|
333
|
+
Content.
|
|
334
|
+
== Reception ==
|
|
335
|
+
Content.
|
|
336
|
+
WIKI
|
|
337
|
+
end
|
|
338
|
+
|
|
339
|
+
let(:article1) { Wp2txt::Article.new(sample_wiki_text, "Person 1") }
|
|
340
|
+
let(:article2) { Wp2txt::Article.new(another_wiki_text, "Work 1") }
|
|
341
|
+
|
|
342
|
+
describe "#process" do
|
|
343
|
+
it "counts articles" do
|
|
344
|
+
collector = described_class.new
|
|
345
|
+
collector.process(article1)
|
|
346
|
+
collector.process(article2)
|
|
347
|
+
expect(collector.total_articles).to eq(2)
|
|
348
|
+
end
|
|
349
|
+
|
|
350
|
+
it "counts section occurrences" do
|
|
351
|
+
collector = described_class.new
|
|
352
|
+
collector.process(article1)
|
|
353
|
+
collector.process(article2)
|
|
354
|
+
|
|
355
|
+
expect(collector.section_counts["Career"]).to eq(2)
|
|
356
|
+
expect(collector.section_counts["Early life"]).to eq(1)
|
|
357
|
+
expect(collector.section_counts["Reception"]).to eq(1)
|
|
358
|
+
end
|
|
359
|
+
end
|
|
360
|
+
|
|
361
|
+
describe "#top_sections" do
|
|
362
|
+
it "returns sections sorted by count" do
|
|
363
|
+
collector = described_class.new
|
|
364
|
+
collector.process(article1)
|
|
365
|
+
collector.process(article2)
|
|
366
|
+
|
|
367
|
+
top = collector.top_sections(2)
|
|
368
|
+
expect(top.first["name"]).to eq("Career")
|
|
369
|
+
expect(top.first["count"]).to eq(2)
|
|
370
|
+
expect(top.length).to eq(2)
|
|
371
|
+
end
|
|
372
|
+
end
|
|
373
|
+
|
|
374
|
+
describe "#to_hash" do
|
|
375
|
+
it "returns statistics as hash" do
|
|
376
|
+
collector = described_class.new
|
|
377
|
+
collector.process(article1)
|
|
378
|
+
collector.process(article2)
|
|
379
|
+
|
|
380
|
+
result = collector.to_hash(top_n: 5)
|
|
381
|
+
expect(result["total_articles"]).to eq(2)
|
|
382
|
+
expect(result["section_counts"]).to be_a(Hash)
|
|
383
|
+
expect(result["top_sections"]).to be_an(Array)
|
|
384
|
+
end
|
|
385
|
+
end
|
|
386
|
+
|
|
387
|
+
describe "#to_json" do
|
|
388
|
+
it "returns valid JSON" do
|
|
389
|
+
collector = described_class.new
|
|
390
|
+
collector.process(article1)
|
|
391
|
+
|
|
392
|
+
json = collector.to_json
|
|
393
|
+
parsed = JSON.parse(json)
|
|
394
|
+
expect(parsed["total_articles"]).to eq(1)
|
|
395
|
+
end
|
|
396
|
+
end
|
|
397
|
+
end
|