wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,337 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "constants"
4
+ require_relative "regex"
5
+
6
+ module Wp2txt
7
+ # Text processing utilities: character conversion, nested structure handling, cleanup
8
+
9
# Module-level store for regex patterns that are built at runtime.
# Exposed for reading and writing as Wp2txt.regex_cache.
@regex_cache = {}
singleton_class.attr_accessor :regex_cache
14
+
15
# Normalizes +text+ to valid UTF-8 with character references and HTML
# entities decoded.
#
# text         - any object; converted with #to_s
# _has_retried - accepted for interface compatibility; not read by this method
#
# Returns a String. When an encoding-related error or ArgumentError is raised
# at any step, the most recent intermediate value is scrubbed and returned.
def convert_characters(text, _has_retried = false)
  current = text
  begin
    # Drop invalid byte sequences before any pattern matching happens
    current = current.to_s.scrub("")
    current = chrref_to_utf(current)
    current = special_chr(current)
    current.encode("UTF-8", "UTF-8", invalid: :replace, replace: "")
  rescue ArgumentError, ::Encoding::UndefinedConversionError, ::Encoding::InvalidByteSequenceError
    # Fall back to whatever stage was reached, scrubbed once more
    current.to_s.scrub("")
  end
end
25
+
26
+ # Get HTML decoder instance (thread-local for Ractor compatibility)
27
# Returns the HTMLEntities decoder stored under :wp2txt_html_decoder in
# Thread.current, creating and storing one on first use.
def html_decoder
  locals = Thread.current
  locals[:wp2txt_html_decoder] = HTMLEntities.new unless locals[:wp2txt_html_decoder]
  locals[:wp2txt_html_decoder]
end
30
+
31
# Decodes HTML entities in +str+, then the extra mathematical entities listed
# in MATH_ENTITIES.
#
# If decoding raises RangeError (a numeric entity outside the Unicode range),
# the offending numeric entities are removed and decoding is attempted once
# more, so the remaining valid entities are still decoded. If the second
# attempt also fails, the cleaned but undecoded text is returned.
#
# str - String containing HTML entities
#
# Returns a String.
def special_chr(str)
  source = str
  attempts = 0
  begin
    result = html_decoder.decode(source)
    # Decode additional mathematical entities not covered by HTMLEntities gem
    result.gsub!(MATH_ENTITIES_REGEX) { MATH_ENTITIES[Regexp.last_match(1)] }
    result
  rescue RangeError
    attempts += 1
    # Second failure: give up decoding, keep the cleaned text
    return source if attempts > 1

    # Numeric entities above U+10FFFF or inside the surrogate range cannot be
    # encoded as UTF-8; remove them so the retry can succeed.
    encodable = ->(cp) { cp <= 0x10FFFF && !cp.between?(0xD800, 0xDFFF) }
    source = source
             .gsub(/&#(\d+);/) { |ent| encodable.call(Regexp.last_match(1).to_i) ? ent : "" }
             .gsub(/&#x([0-9a-fA-F]+);/) { |ent| encodable.call(Regexp.last_match(1).to_i(16)) ? ent : "" }
    retry
  end
end
48
+
49
# Replaces numeric character references matched by CHRREF_TO_UTF_REGEX with
# the corresponding UTF-8 character.
#
# Capture 1 equal to "x" marks a hexadecimal reference; capture 2 holds the
# digits. References that do not name an encodable code point (zero, above
# U+10FFFF, or in the surrogate range U+D800..U+DFFF) are removed, so the
# result never gains invalid UTF-8 byte sequences from this step.
#
# num_str - String possibly containing numeric character references
#
# Returns a String; on RangeError/ArgumentError the input is returned as is.
def chrref_to_utf(num_str)
  num_str.gsub(CHRREF_TO_UTF_REGEX) do
    match = Regexp.last_match
    codepoint = match[1] == "x" ? match[2].to_i(16) : match[2].to_i
    encodable = codepoint.positive? &&
                codepoint <= 0x10FFFF &&
                !codepoint.between?(0xD800, 0xDFFF) # surrogates are not characters
    encodable ? [codepoint].pack("U") : ""
  end
rescue RangeError, ArgumentError
  # RangeError: invalid codepoint, ArgumentError: pack error
  num_str
end
63
+
64
# Returns a copy of +str+ in which every MNDASH_REGEX match is replaced by an
# en dash.
def mndash(str)
  str.gsub(MNDASH_REGEX) { "–" }
end
67
+
68
+ #################### parser for nested structure ####################
69
+
70
+ # Optimized single-pass nested structure processor
71
+ # Processes innermost brackets first, avoiding recursion overhead
72
# Entry point for nested-bracket processing.
#
# scanner     - a StringScanner (its whole underlying string is used) or any
#               object convertible with #to_s
# left, right - opening and closing delimiter strings
# block       - forwarded to process_nested_single_pass
#
# Returns whatever process_nested_single_pass returns.
def process_nested_structure(scanner, left, right, &block)
  source = case scanner
           when StringScanner then scanner.string
           else scanner.to_s
           end
  process_nested_single_pass(source, left, right, &block)
end
76
+
77
+ # Single-pass iterative processor - finds and processes innermost brackets first
78
+ # This avoids the overhead of recursive calls and repeated string scanning
79
# Repeatedly finds the first innermost left/right delimiter pair, yields the
# text between the delimiters, and substitutes the block's result for the
# whole pair (delimiters included). Scanning restarts from the beginning
# after each substitution and stops when no complete pair remains or after
# MAX_NESTING_ITERATIONS substitutions.
#
# str         - String to process
# left, right - opening and closing delimiter strings
# block       - receives the inner content; its return value is converted
#               with #to_s, so a nil result removes the pair instead of
#               raising TypeError
#
# Returns the processed String. +str+ itself is returned when it does not
# contain +left+, or when one of the rescued errors is raised.
def process_nested_single_pass(str, left, right, &block)
  return str unless str.include?(left)

  result = +str
  left_len = left.length
  right_len = right.length
  max_iterations = MAX_NESTING_ITERATIONS

  iterations = 0
  loop do
    iterations += 1
    break if iterations > max_iterations

    pos = 0
    found = false

    while pos < result.length
      # Find next left bracket
      left_pos = result.index(left, pos)
      break unless left_pos

      # Look for nested left bracket and matching right bracket
      inner_left = result.index(left, left_pos + left_len)
      right_pos = result.index(right, left_pos + left_len)

      # No closing delimiter after this opener: nothing more can be paired
      break unless right_pos

      # If there's a nested left bracket before the right, skip to process inner first
      if inner_left && inner_left < right_pos
        pos = inner_left
        next
      end

      # Found innermost pair - process it
      content = result[(left_pos + left_len)...right_pos]
      processed = (yield content).to_s
      result = result[0...left_pos] + processed + result[(right_pos + right_len)..]
      found = true
      break
    end

    break unless found
  end

  result
rescue RegexpError, ArgumentError, SystemStackError
  # On any of these errors the unprocessed input is returned
  str
end
129
+
130
+ #################### nowiki handling ####################
131
+
132
# Replaces each ESCAPE_NOWIKI_REGEX match in +str+ with a placeholder of the
# form "<nowiki-ID>" and remembers the first capture in @nowikis under ID.
# ID is the object_id of the captured String. Any entries from a previous
# call are discarded first.
#
# Returns the String with placeholders.
def escape_nowiki(str)
  (@nowikis ||= {}).clear
  str.gsub(ESCAPE_NOWIKI_REGEX) do
    captured = Regexp.last_match(1)
    key = captured.object_id
    @nowikis[key] = captured
    "<nowiki-#{key}>"
  end
end
145
+
146
# Replaces each UNESCAPE_NOWIKI_REGEX match in +str+ with the text stored in
# @nowikis under the integer value of the first capture.
#
# A placeholder with no stored entry is replaced by an empty string. When
# @nowikis has not been initialised yet, it is treated as empty instead of
# raising NoMethodError.
#
# Returns a String.
def unescape_nowiki(str)
  stash = @nowikis || {}
  str.gsub(UNESCAPE_NOWIKI_REGEX) do
    stash[Regexp.last_match(1).to_i].to_s
  end
end
152
+
153
+ #################### cleanup and removal methods ####################
154
+
155
# Applies the cleanup regexes to +text+ in a fixed order and returns the
# result stripped and terminated by exactly two newlines.
#
# The work is done on a private copy: the caller's String is never modified,
# even when it is unfrozen (unary plus alone would hand back the same object).
#
# text - any object; converted with #to_s
#
# Returns a new String.
def cleanup(text)
  result = text.to_s.dup
  remove_all = ->(patterns) { patterns.each { |pattern| result.gsub!(pattern, "") } }

  remove_all.call([CLEANUP_REGEX_01, CLEANUP_REGEX_02, CLEANUP_REGEX_03, CLEANUP_REGEX_04,
                   CLEANUP_REGEX_05, CLEANUP_REGEX_06, CLEANUP_REGEX_07])
  # Reduce 3+ consecutive newlines to 2
  result.gsub!(CLEANUP_REGEX_08, "\n\n")
  # Also handle mixed whitespace patterns (spaces/tabs between newlines)
  result.gsub!(CLEANUP_MIXED_WHITESPACE_REGEX, "\n\n")

  # Multiple consecutive spaces → single space (but preserve indentation at line start)
  result.gsub!(CLEANUP_MULTIPLE_SPACES_REGEX, '\1 ')

  # Empty parentheses → remove (both ASCII and Japanese)
  result.gsub!(CLEANUP_EMPTY_PARENS_REGEX, "")

  remove_all.call([
    # Leftover pipe characters (table/infobox remnants)
    CLEANUP_MULTIPLE_PIPES_REGEX,
    CLEANUP_TRAILING_PIPE_REGEX,
    CLEANUP_PIPE_LINE_REGEX,
    # Lines with multiple pipe-separated key=value pairs (infobox remnants)
    CLEANUP_KEY_VALUE_LINE_REGEX,
    # Template name remnants (data-driven from template_aliases.json)
    CLEANUP_REMNANTS_REGEX,
    # Imagemap/gallery remnants: lines like "Image:file.jpg|thumb|...|caption" without [[ brackets
    CLEANUP_FILE_LINE_REGEX,
    # Incomplete File/Image links (opened but not closed on same logical unit)
    CLEANUP_FILE_INCOMPLETE_REGEX
  ])
  # Orphaned closing brackets from split File links (e.g., "caption]] rest of text")
  result.gsub!(CLEANUP_ORPHANED_CLOSE_REGEX, '\1')
  # Orphaned opening brackets and standalone ]] lines (combined for single pass)
  result.gsub!(CLEANUP_ORPHANED_BRACKETS_REGEX, "")
  # ]] preceded by pipe without matching [[ (orphaned from broken links)
  result.gsub!(CLEANUP_PIPE_CLOSE_REGEX) { "#{Regexp.last_match(1)}#{Regexp.last_match(2)}" }

  # Multilingual cleanup (language-agnostic patterns)
  remove_all.call([
    # MediaWiki magic words, template format ({{DEFAULTSORT:value}}) then bare format
    MAGIC_WORD_TEMPLATE_REGEX,
    MAGIC_WORD_LINE_REGEX,
    # Double-underscore magic words: __NOTOC__, __TOC__, __FORCETOC__, etc.
    DOUBLE_UNDERSCORE_MAGIC_REGEX,
    # Interwiki links: :en:Article → Article (keep article name, remove prefix)
    INTERWIKI_PREFIX_REGEX,
    # Authority control templates: Normdaten, Authority control, Persondata, etc.
    AUTHORITY_CONTROL_REGEX,
    # Category lines in various languages (but NOT "CATEGORIES:" summary line)
    CATEGORY_LINE_REGEX,
    # Wikimedia sister project markers: Wikibooks, Commons, School:..., etc.
    WIKIMEDIA_PROJECT_REGEX,
    # Lone asterisk lines (list markers without content)
    LONE_ASTERISK_REGEX
  ])

  # Final cleanup: reduce multiple blank lines again after all removals
  result.gsub!(CLEANUP_MULTI_BLANK_REGEX, "\n\n")

  result.strip!
  result << "\n\n"
end
228
+
229
# Extension tags to remove (block-level tags that should be stripped)
# Data source: mediawiki_aliases.json (extension_tags)
# These are MediaWiki extension tags like <gallery>, <timeline>, <imagemap>, etc.
# Falls back to an empty list when the data has no "extension_tags" entry.
# Frozen so the constant cannot be modified after load.
EXTENSION_TAGS = (Wp2txt.load_mediawiki_data["extension_tags"] || []).freeze

# Block-level extension tags to process in remove_html
# Not all extension tags should be removed here - some are handled by markers (math, chem, etc.)
# and some are inline (ref). We only remove block-level content containers.
BLOCK_EXTENSION_TAGS = %w[div gallery timeline noinclude imagemap poem hiero graph categorytree section].freeze
238
+
239
# Removes HTML comments, self-closing tags, block-level extension tag
# sections and imagemap coordinate remnants from +str+.
#
# The work is done on a private copy, so the caller's String is never
# modified even when it is unfrozen.
#
# str - any object; converted with #to_s
#
# Returns a new String.
def remove_html(str)
  res = str.to_s.dup
  # Remove HTML comments first (before other processing to avoid [ref] in comments issue)
  res.gsub!(HTML_COMMENT_REGEX, "")
  res.gsub!(SELF_CLOSING_TAG_REGEX, "")

  # BLOCK_EXTENSION_TAGS plus those data-driven extension tags that are on the
  # block-level allow-list. The list is built once, not once per element.
  block_level = %w[div gallery timeline noinclude imagemap poem hiero graph categorytree section abschnitt]
  from_data = EXTENSION_TAGS.select { |tag| block_level.include?(tag) }
  tags_to_remove = (BLOCK_EXTENSION_TAGS + from_data).uniq

  tags_to_remove.each do |tag|
    opener = "<#{tag}"
    # Early exit if tag not present
    next unless res.include?(opener)

    # Everything from "<tag" up to the next "tag>" is dropped, innermost first
    res.replace(process_nested_single_pass(res, opener, "#{tag}>") { "" })
  end
  # Remove imagemap coordinate remnants (rect, poly, circle, default with coordinates)
  res.gsub!(IMAGEMAP_COORD_REGEX, "")
  res
end
262
+
263
# Rewrites COMPLEX_REGEX_01 matches as their first capture wrapped in 《 》
# and deletes COMPLEX_REGEX_02..05 matches.
#
# The work is done on a private copy, so the caller's String is never
# modified even when it is unfrozen.
#
# str - any object; converted with #to_s
#
# Returns a new String.
def remove_complex(str)
  result = str.to_s.dup
  result.gsub!(COMPLEX_REGEX_01) { "《#{Regexp.last_match(1)}》" }
  [COMPLEX_REGEX_02, COMPLEX_REGEX_03, COMPLEX_REGEX_04, COMPLEX_REGEX_05].each do |pattern|
    result.gsub!(pattern, "")
  end
  result
end
273
+
274
# Removes every span of +str+ that starts with tagset[0], ends with
# tagset[1], and contains none of the characters used in the tagset in
# between.
#
# The compiled regex is cached in Wp2txt.regex_cache. The cache key is built
# from the inspected tagset, so tagsets whose elements merely concatenate to
# the same text (e.g. ["a", "bc"] and ["ab", "c"]) get separate entries.
#
# str    - String to process
# tagset - Array whose first two elements are the opening and closing
#          delimiters (default: ["<", ">"])
#
# Returns a new String.
def remove_inbetween(str, tagset = ["<", ">"])
  cache_key = "inbetween:#{tagset.inspect}"
  regex = Wp2txt.regex_cache[cache_key] ||= begin
    excluded = Regexp.escape(tagset.uniq.join)
    Regexp.new("#{Regexp.escape(tagset[0])}[^#{excluded}]*#{Regexp.escape(tagset[1])}")
  end
  str.gsub(regex, "")
end
283
+
284
# Returns a copy of +str+ with every REMOVE_TAG_REGEX match deleted.
def remove_tag(str)
  str.gsub(REMOVE_TAG_REGEX) { "" }
end
287
+
288
# Returns a copy of +str+ with every REMOVE_DIRECTIVES_REGEX match deleted.
def remove_directive(str)
  str.gsub(REMOVE_DIRECTIVES_REGEX) { "" }
end
291
+
292
# Returns a copy of +str+ in which every REMOVE_EMPHASIS_REGEX match is
# replaced by its second capture group.
def remove_emphasis(str)
  str.gsub(REMOVE_EMPHASIS_REGEX) { Regexp.last_match(2) }
end
297
+
298
# Returns a copy of +str+ with every REMOVE_HR_REGEX match deleted.
def remove_hr(str)
  str.gsub(REMOVE_HR_REGEX) { "" }
end
301
+
302
# Returns a copy of +str+ with every FORMAT_REF_REGEX match deleted.
def remove_ref(str)
  str.gsub(FORMAT_REF_REGEX, "")
end
305
+
306
# Rewrites reference markup in +str+: MAKE_REFERENCE_REGEX_A matches become a
# newline, _B matches are deleted, _C matches become "[ref]" and _D matches
# become "[/ref]", applied in that order.
#
# The work is done on a private copy, so the caller's String is never
# modified even when it is unfrozen.
#
# str - any object; converted with #to_s
#
# Returns a new String.
def make_reference(str)
  result = str.to_s.dup
  [
    [MAKE_REFERENCE_REGEX_A, "\n"],
    [MAKE_REFERENCE_REGEX_B, ""],
    [MAKE_REFERENCE_REGEX_C, "[ref]"],
    [MAKE_REFERENCE_REGEX_D, "[/ref]"]
  ].each { |pattern, replacement| result.gsub!(pattern, replacement) }
  result
end
315
+
316
+ # =========================================================================
317
+ # Make constants Ractor-shareable for parallel processing
318
+ # =========================================================================
319
+ module_function
320
+
321
# Makes every constant defined directly on this module Ractor-shareable where
# possible. Does nothing when Ractor.make_shareable is unavailable.
#
# Constants that are already shareable are skipped. A constant that cannot be
# made shareable is left as it is: Ractor::Error (the parent of
# Ractor::IsolationError) is rescued so that one such constant does not abort
# the loop, and the remaining constants are still processed.
def self.make_constants_ractor_shareable!
  return unless defined?(Ractor) && Ractor.respond_to?(:make_shareable)

  constants(false).each do |const_name|
    const = const_get(const_name)
    next if Ractor.shareable?(const)

    begin
      Ractor.make_shareable(const)
    rescue Ractor::Error, FrozenError, TypeError
      # Some constants can't be made shareable, skip them
    end
  end
end
335
+
336
+ make_constants_ractor_shareable!
337
+ end