wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,261 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+
5
+ RSpec.describe Wp2txt::MagicWordExpander do # Examples for MagicWordExpander#expand and for magic-word handling via format_wiki
6
+ let(:title) { "Test Article" } # default page title handed to the expander
7
+ let(:namespace) { "" } # empty namespace; the examples below treat this as the main namespace
8
+ let(:dump_date) { Time.new(2024, 6, 15, 14, 30, 45) } # fixed timestamp so date/time expectations are deterministic. NOTE(review): Time.new without a zone is local time - confirm the expander does not convert to UTC
9
+ let(:expander) { described_class.new(title, namespace: namespace, dump_date: dump_date) } # subject under test, built from the lets above
10
+
11
+ describe "#expand" do # direct expansion of magic words inside a wikitext string
12
+ context "page context magic words" do # values derived from the title and namespace
13
+ it "expands {{PAGENAME}}" do
14
+ expect(expander.expand("{{PAGENAME}}")).to eq("Test Article")
15
+ end
16
+
17
+ it "expands {{pagename}} (case-insensitive)" do
18
+ expect(expander.expand("{{pagename}}")).to eq("Test Article")
19
+ end
20
+
21
+ it "expands {{PAGENAMEE}} with URL encoding" do
22
+ expect(expander.expand("{{PAGENAMEE}}")).to eq("Test_Article") # the space is expected to become an underscore
23
+ end
24
+
25
+ it "expands {{FULLPAGENAME}} without namespace" do
26
+ expect(expander.expand("{{FULLPAGENAME}}")).to eq("Test Article") # no "Namespace:" prefix when the namespace is empty
27
+ end
28
+
29
+ context "with namespace" do
30
+ let(:namespace) { "Wikipedia" } # overrides the outer empty namespace for this context
31
+
32
+ it "expands {{FULLPAGENAME}} with namespace" do
33
+ expect(expander.expand("{{FULLPAGENAME}}")).to eq("Wikipedia:Test Article")
34
+ end
35
+
36
+ it "expands {{NAMESPACE}}" do
37
+ expect(expander.expand("{{NAMESPACE}}")).to eq("Wikipedia")
38
+ end
39
+ end
40
+
41
+ context "with subpage title" do
42
+ let(:title) { "Main Page/Subpage/Deep" } # three slash-separated segments
43
+
44
+ it "expands {{BASEPAGENAME}} to parent" do
45
+ expect(expander.expand("{{BASEPAGENAME}}")).to eq("Main Page/Subpage") # everything except the last segment
46
+ end
47
+
48
+ it "expands {{ROOTPAGENAME}} to root" do
49
+ expect(expander.expand("{{ROOTPAGENAME}}")).to eq("Main Page") # first segment only
50
+ end
51
+
52
+ it "expands {{SUBPAGENAME}} to last part" do
53
+ expect(expander.expand("{{SUBPAGENAME}}")).to eq("Deep") # last segment only
54
+ end
55
+ end
56
+
57
+ it "expands {{TALKPAGENAME}}" do
58
+ expect(expander.expand("{{TALKPAGENAME}}")).to eq("Talk:Test Article")
59
+ end
60
+
61
+ it "expands {{NAMESPACENUMBER}} for main namespace" do
62
+ expect(expander.expand("{{NAMESPACENUMBER}}")).to eq("0")
63
+ end
64
+ end
65
+
66
+ context "date/time magic words" do # all expectations derive from the fixed dump_date (2024-06-15 14:30:45)
67
+ it "expands {{CURRENTYEAR}}" do
68
+ expect(expander.expand("{{CURRENTYEAR}}")).to eq("2024")
69
+ end
70
+
71
+ it "expands {{CURRENTMONTH}} with zero padding" do
72
+ expect(expander.expand("{{CURRENTMONTH}}")).to eq("06")
73
+ end
74
+
75
+ it "expands {{CURRENTMONTH1}} without zero padding" do
76
+ expect(expander.expand("{{CURRENTMONTH1}}")).to eq("6")
77
+ end
78
+
79
+ it "expands {{CURRENTMONTHNAME}}" do
80
+ expect(expander.expand("{{CURRENTMONTHNAME}}")).to eq("June")
81
+ end
82
+
83
+ it "expands {{CURRENTMONTHABBREV}}" do
84
+ expect(expander.expand("{{CURRENTMONTHABBREV}}")).to eq("Jun")
85
+ end
86
+
87
+ it "expands {{CURRENTDAY}}" do
88
+ expect(expander.expand("{{CURRENTDAY}}")).to eq("15")
89
+ end
90
+
91
+ it "expands {{CURRENTDAY2}} with zero padding" do
92
+ dump_date_single_digit = Time.new(2024, 6, 5) # single-digit day so the padding is observable
93
+ exp = described_class.new(title, dump_date: dump_date_single_digit) # namespace: is omitted here, so the constructor's default applies
94
+ expect(exp.expand("{{CURRENTDAY2}}")).to eq("05")
95
+ end
96
+
97
+ it "expands {{CURRENTDOW}} (day of week)" do
98
+ # June 15, 2024 is a Saturday (6)
99
+ expect(expander.expand("{{CURRENTDOW}}")).to eq("6")
100
+ end
101
+
102
+ it "expands {{CURRENTDAYNAME}}" do
103
+ expect(expander.expand("{{CURRENTDAYNAME}}")).to eq("Saturday")
104
+ end
105
+
106
+ it "expands {{CURRENTTIME}}" do
107
+ expect(expander.expand("{{CURRENTTIME}}")).to eq("14:30") # hours and minutes only
108
+ end
109
+
110
+ it "expands {{CURRENTHOUR}}" do
111
+ expect(expander.expand("{{CURRENTHOUR}}")).to eq("14")
112
+ end
113
+
114
+ it "expands {{CURRENTTIMESTAMP}}" do
115
+ expect(expander.expand("{{CURRENTTIMESTAMP}}")).to eq("20240615143045") # year, month, day, hour, minute, second concatenated
116
+ end
117
+
118
+ it "expands {{LOCALYEAR}} (same as CURRENTYEAR)" do
119
+ expect(expander.expand("{{LOCALYEAR}}")).to eq("2024")
120
+ end
121
+ end
122
+
123
+ context "string functions" do # colon-style functions that transform their argument
124
+ it "expands {{lc:TEXT}}" do
125
+ expect(expander.expand("{{lc:HELLO WORLD}}")).to eq("hello world")
126
+ end
127
+
128
+ it "expands {{uc:text}}" do
129
+ expect(expander.expand("{{uc:hello world}}")).to eq("HELLO WORLD")
130
+ end
131
+
132
+ it "expands {{lcfirst:TEXT}}" do
133
+ expect(expander.expand("{{lcfirst:HELLO}}")).to eq("hELLO") # only the first character changes case
134
+ end
135
+
136
+ it "expands {{ucfirst:text}}" do
137
+ expect(expander.expand("{{ucfirst:hello}}")).to eq("Hello")
138
+ end
139
+
140
+ it "expands {{urlencode:...}}" do
141
+ expect(expander.expand("{{urlencode:hello world}}")).to eq("hello_world") # NOTE(review): MediaWiki's default urlencode style encodes a space as "+"; this pins the underscore form - confirm that is intended
142
+ end
143
+
144
+ it "expands {{anchorencode:...}}" do
145
+ expect(expander.expand("{{anchorencode:hello world}}")).to eq("hello_world")
146
+ end
147
+
148
+ it "expands {{padleft:...}}" do
149
+ expect(expander.expand("{{padleft:7|3|0}}")).to eq("007") # value | target length | pad character
150
+ end
151
+
152
+ it "expands {{padright:...}}" do
153
+ expect(expander.expand("{{padright:7|3|0}}")).to eq("700")
154
+ end
155
+
156
+ it "expands {{formatnum:...}} with thousand separators" do
157
+ expect(expander.expand("{{formatnum:12345}}")).to eq("12,345")
158
+ expect(expander.expand("{{formatnum:1234567}}")).to eq("1,234,567")
159
+ expect(expander.expand("{{formatnum:1234.56}}")).to eq("1,234.56") # the fractional part is left ungrouped
160
+ end
161
+
162
+ it "expands {{formatnum:...|R}} to remove formatting" do
163
+ expect(expander.expand("{{formatnum:1,234,567|R}}")).to eq("1234567")
164
+ end
165
+ end
166
+
167
+ context "#titleparts parser function" do # arguments: path | segment count | first segment
168
+ it "extracts first N segments" do
169
+ expect(expander.expand("{{#titleparts:A/B/C|2}}")).to eq("A/B")
170
+ end
171
+
172
+ it "extracts from offset" do
173
+ expect(expander.expand("{{#titleparts:A/B/C|1|2}}")).to eq("B") # one segment, starting at the second segment
174
+ end
175
+
176
+ it "handles negative count (all but last N)" do
177
+ expect(expander.expand("{{#titleparts:A/B/C/D|-1}}")).to eq("A/B/C")
178
+ end
179
+
180
+ it "returns full path without parameters" do
181
+ expect(expander.expand("{{#titleparts:A/B/C}}")).to eq("A/B/C")
182
+ end
183
+ end
184
+
185
+ context "multiple magic words in one string" do
186
+ it "expands all magic words" do
187
+ input = "Page: {{PAGENAME}}, Year: {{CURRENTYEAR}}, Month: {{CURRENTMONTHNAME}}"
188
+ expected = "Page: Test Article, Year: 2024, Month: June"
189
+ expect(expander.expand(input)).to eq(expected)
190
+ end
191
+
192
+ it "handles mixed case and functions" do
193
+ input = "{{uc:{{PAGENAME}}}} in {{CURRENTYEAR}}" # a magic word nested inside a string function
194
+ # The uc: function uppercases the inner PAGENAME result
195
+ result = expander.expand(input)
196
+ expect(result).to include("TEST ARTICLE") # substring checks only; the exact combined output is not pinned
197
+ expect(result).to include("2024")
198
+ end
199
+ end
200
+
201
+ context "unrecognized magic words" do # input the expander is expected to pass through untouched
202
+ it "leaves unrecognized magic words unchanged" do
203
+ expect(expander.expand("{{UNKNOWNMAGICWORD}}")).to eq("{{UNKNOWNMAGICWORD}}")
204
+ end
205
+
206
+ it "leaves template calls unchanged" do
207
+ expect(expander.expand("{{Infobox|name=test}}")).to eq("{{Infobox|name=test}}")
208
+ end
209
+ end
210
+
211
+ context "edge cases" do
212
+ it "handles nil input" do
213
+ expect(expander.expand(nil)).to eq(nil) # nil in, nil out. NOTE(review): be_nil is the idiomatic RSpec matcher for this
214
+ end
215
+
216
+ it "handles empty string" do
217
+ expect(expander.expand("")).to eq("")
218
+ end
219
+
220
+ it "handles text without magic words" do
221
+ expect(expander.expand("Hello World")).to eq("Hello World")
222
+ end
223
+
224
+ it "handles magic words with extra whitespace" do
225
+ expect(expander.expand("{{ PAGENAME }}")).to eq("Test Article") # whitespace inside the braces is tolerated
226
+ end
227
+ end
228
+ end
229
+
230
+ describe "integration with format_wiki" do # exercises magic-word expansion through format_wiki rather than the expander directly
231
+ include Wp2txt # mixed in so format_wiki can be called without a receiver below
232
+
233
+ it "expands magic words when title is provided in config" do
234
+ input = "This article is about {{PAGENAME}}."
235
+ result = format_wiki(input, title: "Ruby Programming")
236
+ expect(result).to include("Ruby Programming")
237
+ expect(result).not_to include("{{PAGENAME}}")
238
+ end
239
+
240
+ it "does not expand magic words without title in config" do
241
+ input = "This article is about {{PAGENAME}}."
242
+ result = format_wiki(input)
243
+ # Without title, the magic word might be removed as template or left as-is
244
+ # The important thing is it doesn't crash
245
+ expect(result).to be_a(String) # only the return type is asserted; whether {{PAGENAME}} was expanded is not checked
246
+ end
247
+
248
+ it "expands date magic words with current time when dump_date not specified" do
249
+ input = "Year: {{CURRENTYEAR}}"
250
+ result = format_wiki(input, title: "Test")
251
+ expect(result).to match(/Year: \d{4}/) # pattern match because the year is not fixed when no dump_date is passed
252
+ end
253
+
254
+ it "expands string functions" do
255
+ input = "{{uc:hello}} {{lc:WORLD}}"
256
+ result = format_wiki(input, title: "Test")
257
+ expect(result).to include("HELLO")
258
+ expect(result).to include("world")
259
+ end
260
+ end
261
+ end