wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,246 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+
5
+ RSpec.describe "MediaWiki Data" do
6
+ describe "extension_tags in mediawiki_aliases.json" do
7
+ before(:all) do # runs once for this group; the parsed data is shared by every example below
8
+ data_path = File.join(__dir__, "..", "lib", "wp2txt", "data", "mediawiki_aliases.json")
9
+ @mediawiki_data = JSON.parse(File.read(data_path)) # NOTE(review): no `require "json"` in this file; presumably loaded via spec_helper, confirm
10
+ end
11
+
12
+ it "contains extension_tags array" do
13
+ expect(@mediawiki_data["extension_tags"]).to be_an(Array)
14
+ end
15
+
16
+ it "contains common extension tags" do
17
+ tags = @mediawiki_data["extension_tags"]
18
+ expect(tags).to include("gallery")
19
+ expect(tags).to include("timeline")
20
+ expect(tags).to include("imagemap")
21
+ expect(tags).to include("math")
22
+ expect(tags).to include("ref")
23
+ expect(tags).to include("syntaxhighlight")
24
+ end
25
+
26
+ it "contains German-specific tag" do
27
+ tags = @mediawiki_data["extension_tags"]
28
+ expect(tags).to include("abschnitt") # German for "section"
29
+ end
30
+ end
31
+ end
32
+
33
+ RSpec.describe "Template Data" do
34
+ let(:data_path) { File.join(__dir__, "..", "lib", "wp2txt", "data", "template_aliases.json") }
35
+
36
+ describe "template_aliases.json" do
37
+ before(:all) do # loads template_aliases.json once for all examples in this group
38
+ @data = if File.exist?(File.join(__dir__, "..", "lib", "wp2txt", "data", "template_aliases.json")) # path rebuilt inline instead of using let(:data_path); presumably because let is not supported in before(:all) hooks
39
+ JSON.parse(File.read(File.join(__dir__, "..", "lib", "wp2txt", "data", "template_aliases.json")))
40
+ else
41
+ nil # a missing file yields nil here instead of File.read raising inside the hook
42
+ end
43
+ end
44
+
45
+ it "exists" do
46
+ expect(File.exist?(data_path)).to be true
47
+ end
48
+
49
+ it "has valid JSON structure" do
50
+ expect(@data).to be_a(Hash)
51
+ end
52
+
53
+ it "has meta information" do
54
+ expect(@data["meta"]).to be_a(Hash)
55
+ expect(@data["meta"]["generated_at"]).to be_a(String)
56
+ end
57
+
58
+ describe "remove_templates category" do
59
+ it "exists and is an array" do
60
+ expect(@data["remove_templates"]).to be_an(Array)
61
+ end
62
+
63
+ it "contains known navigation templates" do
64
+ templates = @data["remove_templates"].map(&:downcase)
65
+ # These should be removed in all languages
66
+ expect(templates).to include("reflist")
67
+ expect(templates).to include("refbegin")
68
+ expect(templates).to include("refend")
69
+ expect(templates).to include("notelist")
70
+ end
71
+
72
+ it "contains hatnote templates" do
73
+ templates = @data["remove_templates"].map(&:downcase)
74
+ expect(templates).to include("main")
75
+ expect(templates).to include("see also")
76
+ expect(templates).to include("further")
77
+ expect(templates).to include("about")
78
+ end
79
+ end
80
+
81
+ describe "authority_control category" do
82
+ it "exists and is an array" do
83
+ expect(@data["authority_control"]).to be_an(Array)
84
+ end
85
+
86
+ it "contains English authority control templates" do
87
+ templates = @data["authority_control"].map(&:downcase)
88
+ expect(templates).to include("authority control")
89
+ end
90
+
91
+ it "contains German Normdaten template" do
92
+ templates = @data["authority_control"].map(&:downcase)
93
+ expect(templates).to include("normdaten")
94
+ end
95
+
96
+ it "contains identifier templates" do
97
+ templates = @data["authority_control"].map(&:downcase)
98
+ %w[viaf lccn gnd isni orcid].each do |id|
99
+ expect(templates).to include(id)
100
+ end
101
+ end
102
+ end
103
+
104
+ describe "cleanup_remnants category" do
105
+ it "exists and is an array" do
106
+ expect(@data["cleanup_remnants"]).to be_an(Array)
107
+ end
108
+
109
+ it "contains layout templates" do
110
+ templates = @data["cleanup_remnants"].map(&:downcase)
111
+ expect(templates).to include("clear")
112
+ expect(templates).to include("clearleft")
113
+ expect(templates).to include("clearright")
114
+ end
115
+ end
116
+
117
+ describe "citation_templates category" do
118
+ it "exists and is an array" do
119
+ expect(@data["citation_templates"]).to be_an(Array)
120
+ end
121
+
122
+ it "contains cite templates in multiple languages" do
123
+ templates = @data["citation_templates"].map(&:downcase)
124
+ # English template names (compared against the downcased list)
125
+ expect(templates).to include("cite web")
126
+ expect(templates).to include("cite book")
127
+ expect(templates).to include("citation")
128
+ # Japanese - NOTE(review): the line below repeats the English "cite web" check, so no non-English name is actually verified here
129
+ expect(templates).to include("cite web") # Same in Japanese Wikipedia
130
+ end
131
+ end
132
+
133
+ describe "ruby_text_templates category" do
134
+ it "exists and is an array" do
135
+ expect(@data["ruby_text_templates"]).to be_an(Array)
136
+ end
137
+
138
+ it "contains Japanese ruby template" do
139
+ templates = @data["ruby_text_templates"]
140
+ expect(templates).to include("読み仮名")
141
+ end
142
+
143
+ it "contains English ruby templates" do
144
+ templates = @data["ruby_text_templates"].map(&:downcase)
145
+ expect(templates).to include("ruby")
146
+ expect(templates).to include("ruby-ja")
147
+ end
148
+ end
149
+
150
+ describe "interwiki_link_templates category" do
151
+ it "exists and is an array" do
152
+ expect(@data["interwiki_link_templates"]).to be_an(Array)
153
+ end
154
+
155
+ it "contains Japanese 仮リンク template" do
156
+ templates = @data["interwiki_link_templates"]
157
+ expect(templates).to include("仮リンク")
158
+ end
159
+
160
+ it "contains interlanguage link templates" do
161
+ templates = @data["interwiki_link_templates"].map(&:downcase)
162
+ expect(templates).to include("ill") # Interlanguage link
163
+ expect(templates).to include("interlanguage link")
164
+ end
165
+ end
166
+
167
+ describe "mixed_script_templates category" do
168
+ it "exists and is an array" do
169
+ expect(@data["mixed_script_templates"]).to be_an(Array)
170
+ end
171
+
172
+ it "contains nihongo template" do
173
+ templates = @data["mixed_script_templates"].map(&:downcase)
174
+ expect(templates).to include("nihongo")
175
+ end
176
+
177
+ it "contains transliteration templates" do
178
+ templates = @data["mixed_script_templates"].map(&:downcase)
179
+ expect(templates).to include("transl")
180
+ end
181
+ end
182
+
183
+ describe "formatting_templates category" do
184
+ it "exists and is an array" do
185
+ expect(@data["formatting_templates"]).to be_an(Array)
186
+ end
187
+
188
+ it "contains size templates" do
189
+ templates = @data["formatting_templates"].map(&:downcase)
190
+ expect(templates).to include("small")
191
+ expect(templates).to include("smaller")
192
+ expect(templates).to include("large")
193
+ expect(templates).to include("larger")
194
+ end
195
+
196
+ it "contains spacing templates" do
197
+ templates = @data["formatting_templates"].map(&:downcase)
198
+ expect(templates).to include("nbsp")
199
+ expect(templates).to include("nowrap")
200
+ end
201
+ end
202
+
203
+ describe "flag_templates category" do
204
+ it "exists and is an array" do
205
+ expect(@data["flag_templates"]).to be_an(Array)
206
+ end
207
+
208
+ it "contains flag template variants" do
209
+ templates = @data["flag_templates"].map(&:downcase)
210
+ expect(templates).to include("flag")
211
+ expect(templates).to include("flagicon")
212
+ expect(templates).to include("flagcountry")
213
+ end
214
+ end
215
+
216
+ describe "portal_templates category" do
217
+ it "exists and is an array" do
218
+ expect(@data["portal_templates"]).to be_an(Array)
219
+ end
220
+
221
+ it "contains portal templates in multiple languages" do
222
+ templates = @data["portal_templates"].map(&:downcase)
223
+ expect(templates).to include("portal")
224
+ # Japanese name, matched against the original (not downcased) list
225
+ expect(@data["portal_templates"]).to include("ウィキポータルリンク")
226
+ end
227
+ end
228
+
229
+ describe "sister_project_templates category" do
230
+ it "exists and is an array" do
231
+ expect(@data["sister_project_templates"]).to be_an(Array)
232
+ end
233
+
234
+ it "contains commons templates" do
235
+ templates = @data["sister_project_templates"].map(&:downcase)
236
+ expect(templates).to include("commons")
237
+ expect(templates).to include("commons category")
238
+ end
239
+
240
+ it "contains wiktionary templates" do
241
+ templates = @data["sister_project_templates"].map(&:downcase)
242
+ expect(templates).to include("wiktionary")
243
+ end
244
+ end
245
+ end
246
+ end