wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,258 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sqlite3"
4
+ require "fileutils"
5
+ require "digest"
6
+
7
module Wp2txt
  # SQLite-backed cache for a parsed multistream index.
  #
  # Index entries (title, page id, byte offset) and the list of stream offsets
  # are stored in one SQLite file per source path so later runs can skip
  # re-parsing the index. A cache is treated as usable only while the cache
  # version, source mtime and source size recorded in it match the source file.
  class IndexCache
    CACHE_VERSION = 1
    CACHE_SUFFIX = ".sqlite3"

    # Files SQLite may leave beside the database while it is in WAL mode.
    SIDECAR_SUFFIXES = %w[-wal -shm].freeze
    # Titles bound per lookup statement. Kept well below SQLite's historical
    # default limit of 999 host parameters so large batches never overflow it.
    LOOKUP_BATCH_SIZE = 500
    private_constant :SIDECAR_SUFFIXES, :LOOKUP_BATCH_SIZE

    attr_reader :cache_path, :source_path

    # @param source_path [String] path of the index file being cached
    # @param cache_dir [String, nil] directory for cache files
    #   (defaults to ~/.wp2txt/cache)
    def initialize(source_path, cache_dir: nil)
      @source_path = source_path
      @cache_dir = cache_dir || default_cache_dir
      @cache_path = build_cache_path
      @db = nil
    end

    # Check if cache exists and is valid for the current source file.
    #
    # @return [Boolean] true if cache is usable; false when either file is
    #   missing, the metadata cannot be read, or version/mtime/size differ
    def valid?
      return false unless File.exist?(@cache_path) && File.exist?(@source_path)

      open_db
      meta = load_metadata
      return false unless meta
      return false if meta[:cache_version].to_i != CACHE_VERSION

      # The source is considered unchanged when both mtime and size match.
      source_stat = File.stat(@source_path)
      meta[:source_mtime].to_i == source_stat.mtime.to_i &&
        meta[:source_size].to_i == source_stat.size
    rescue SQLite3::Exception
      false
    ensure
      close_db
    end

    # Load all index entries from the cache.
    #
    # @return [Hash, nil] { entries_by_title: {}, entries_by_id: {}, stream_offsets: [] },
    #   or nil when the cache is not valid
    def load
      return nil unless valid?

      entries_by_title = {}
      entries_by_id = {}
      stream_offsets = []

      open_db
      begin
        @db.execute("SELECT title, page_id, byte_offset FROM index_entries") do |row|
          title, page_id, offset = row
          entry = { offset: offset, page_id: page_id, title: title }
          entries_by_title[title] = entry
          entries_by_id[page_id] = entry
        end

        @db.execute("SELECT byte_offset FROM stream_offsets ORDER BY byte_offset") do |row|
          stream_offsets << row[0]
        end

        { entries_by_title: entries_by_title, entries_by_id: entries_by_id, stream_offsets: stream_offsets }
      ensure
        close_db
      end
    end

    # Save index entries to a freshly created cache file.
    #
    # Any previous cache for this source is removed first. If anything goes
    # wrong the transaction is rolled back, the connection is closed, and the
    # partial cache file is deleted before the error propagates.
    #
    # @param entries_by_title [Hash] title => entry hash (with :page_id and :offset)
    # @param stream_offsets [Array<Integer>] sorted stream offsets
    # @return [true]
    # @raise [SQLite3::Exception] when the database cannot be written
    def save(entries_by_title, stream_offsets)
      FileUtils.mkdir_p(File.dirname(@cache_path))
      remove_cache_files

      succeeded = false
      begin
        open_db
        create_schema

        # One transaction for all inserts: far faster than autocommit per row.
        @db.execute("BEGIN TRANSACTION")

        source_stat = File.stat(@source_path)
        save_metadata(
          source_path: @source_path,
          source_mtime: source_stat.mtime.to_i,
          source_size: source_stat.size,
          cache_version: CACHE_VERSION,
          entry_count: entries_by_title.size
        )

        with_statement("INSERT INTO index_entries (title, page_id, byte_offset) VALUES (?, ?, ?)") do |stmt|
          entries_by_title.each do |title, entry|
            stmt.execute([title, entry[:page_id], entry[:offset]])
          end
        end

        with_statement("INSERT INTO stream_offsets (byte_offset) VALUES (?)") do |stmt|
          stream_offsets.each { |offset| stmt.execute([offset]) }
        end

        @db.execute("COMMIT")
        succeeded = true
      rescue SQLite3::Exception
        rollback_quietly
        raise
      ensure
        # Close before deleting so SQLite can release the file and its sidecars.
        close_db
        remove_cache_files unless succeeded
      end

      true
    end

    # Find entries by titles (batched lookup).
    #
    # Titles are queried in slices of LOOKUP_BATCH_SIZE so the number of bound
    # parameters per statement stays within SQLite's limit.
    #
    # @param titles [Array<String>] titles to look up
    # @return [Hash] title => entry for every title found; titles that are not
    #   in the cache are absent. Empty when the cache is not valid.
    def find_by_titles(titles)
      return {} if titles.empty?
      return {} unless valid?

      results = {}
      open_db
      begin
        titles.each_slice(LOOKUP_BATCH_SIZE) do |batch|
          placeholders = Array.new(batch.size, "?").join(",")
          sql = "SELECT title, page_id, byte_offset FROM index_entries WHERE title IN (#{placeholders})"

          # SQLite3 2.x requires bind variables as an array
          @db.execute(sql, batch) do |row|
            title, page_id, offset = row
            results[title] = { offset: offset, page_id: page_id, title: title }
          end
        end

        results
      ensure
        close_db
      end
    end

    # Get cache statistics.
    #
    # @return [Hash, nil] cache path/size, row counts and recorded source
    #   metadata, or nil when no cache file exists. Metadata fields are nil when
    #   the metadata table cannot be read.
    def stats
      return nil unless File.exist?(@cache_path)

      open_db
      begin
        # load_metadata returns nil on SQLite errors; fall back to an empty hash
        # so the remaining statistics can still be reported.
        meta = load_metadata || {}
        entry_count = @db.get_first_value("SELECT COUNT(*) FROM index_entries")
        stream_count = @db.get_first_value("SELECT COUNT(*) FROM stream_offsets")

        {
          cache_path: @cache_path,
          cache_size: File.size(@cache_path),
          entry_count: entry_count,
          stream_count: stream_count,
          source_path: meta[:source_path],
          source_mtime: meta[:source_mtime] ? Time.at(meta[:source_mtime].to_i) : nil,
          cache_version: meta[:cache_version]
        }
      ensure
        close_db
      end
    end

    # Delete the cache file together with any WAL sidecar files.
    def clear!
      remove_cache_files
    end

    private

    def default_cache_dir
      File.expand_path("~/.wp2txt/cache")
    end

    # Cache file name: source basename (last extension and a trailing "-index"
    # removed) plus the first 8 hex digits of the MD5 of the source path as
    # given, so equal basenames in different locations do not collide.
    def build_cache_path
      basename = File.basename(@source_path, ".*").sub(/-index$/, "")
      path_hash = Digest::MD5.hexdigest(@source_path)[0, 8]
      File.join(@cache_dir, "#{basename}_#{path_hash}#{CACHE_SUFFIX}")
    end

    # The database file plus the sidecar files SQLite may create next to it.
    def cache_files
      [@cache_path, *SIDECAR_SUFFIXES.map { |suffix| "#{@cache_path}#{suffix}" }]
    end

    def remove_cache_files
      FileUtils.rm_f(cache_files)
    end

    # Open the database (once) and apply the performance pragmas.
    def open_db
      return @db if @db

      @db = SQLite3::Database.new(@cache_path)
      @db.execute("PRAGMA journal_mode = WAL")
      @db.execute("PRAGMA synchronous = NORMAL")
      @db.execute("PRAGMA cache_size = -64000") # 64MB cache
      @db
    end

    def close_db
      @db&.close
      @db = nil
    end

    # Roll back the open transaction, ignoring SQLite errors such as
    # "no transaction is active" so the original failure is what propagates.
    def rollback_quietly
      @db&.execute("ROLLBACK")
    rescue SQLite3::Exception
      nil
    end

    # Yield a prepared statement and always close it, even if the block raises.
    def with_statement(sql)
      stmt = @db.prepare(sql)
      yield stmt
    ensure
      stmt&.close
    end

    def create_schema
      @db.execute(<<~SQL)
        CREATE TABLE IF NOT EXISTS metadata (
          key TEXT PRIMARY KEY,
          value TEXT
        )
      SQL

      @db.execute(<<~SQL)
        CREATE TABLE IF NOT EXISTS index_entries (
          title TEXT PRIMARY KEY,
          page_id INTEGER,
          byte_offset INTEGER
        )
      SQL

      @db.execute("CREATE INDEX IF NOT EXISTS idx_page_id ON index_entries(page_id)")
      @db.execute("CREATE INDEX IF NOT EXISTS idx_byte_offset ON index_entries(byte_offset)")

      @db.execute(<<~SQL)
        CREATE TABLE IF NOT EXISTS stream_offsets (
          byte_offset INTEGER PRIMARY KEY
        )
      SQL
    end

    # Store each key/value pair as text in the metadata table.
    def save_metadata(hash)
      with_statement("INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)") do |stmt|
        hash.each { |key, value| stmt.execute([key.to_s, value.to_s]) }
      end
    end

    # @return [Hash, nil] metadata keyed by symbol (values are strings), or nil
    #   when the metadata table cannot be queried
    def load_metadata
      result = {}
      @db.execute("SELECT key, value FROM metadata") do |row|
        key, value = row
        result[key.to_sym] = value
      end
      result
    rescue SQLite3::Exception
      nil
    end
  end
end
@@ -0,0 +1,353 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "regex"
4
+
5
module Wp2txt
  # Expands MediaWiki magic words to their actual values.
  #
  # Supports page context variables ({{PAGENAME}} ...), date/time variables
  # ({{CURRENTYEAR}} ...), a set of string functions ({{lc:...}},
  # {{padleft:...}}, {{formatnum:...}} ...) and the #titleparts parser function.
  # Unrecognized {{...}} constructs are left untouched.
  class MagicWordExpander
    # Page context magic words (matched case-insensitively)
    PAGE_CONTEXT_WORDS = {
      "PAGENAME" => :pagename,
      "PAGENAMEE" => :pagename_encoded,
      "FULLPAGENAME" => :fullpagename,
      "FULLPAGENAMEE" => :fullpagename_encoded,
      "BASEPAGENAME" => :basepagename,
      "BASEPAGENAMEE" => :basepagename_encoded,
      "ROOTPAGENAME" => :rootpagename,
      "ROOTPAGENAMEE" => :rootpagename_encoded,
      "SUBPAGENAME" => :subpagename,
      "SUBPAGENAMEE" => :subpagename_encoded,
      "TALKPAGENAME" => :talkpagename,
      "TALKPAGENAMEE" => :talkpagename_encoded,
      "SUBJECTPAGENAME" => :subjectpagename,
      "SUBJECTPAGENAMEE" => :subjectpagename_encoded,
      "ARTICLEPAGENAME" => :subjectpagename,
      "ARTICLEPAGENAMEE" => :subjectpagename_encoded,
      "NAMESPACE" => :namespace,
      "NAMESPACEE" => :namespace_encoded,
      "NAMESPACENUMBER" => :namespace_number,
      "TALKSPACE" => :talkspace,
      "TALKSPACEE" => :talkspace_encoded,
      "SUBJECTSPACE" => :subjectspace,
      "SUBJECTSPACEE" => :subjectspace_encoded,
      "ARTICLESPACE" => :subjectspace,
      "ARTICLESPACEE" => :subjectspace_encoded
    }.freeze

    # Date/time magic words
    DATETIME_WORDS = {
      "CURRENTYEAR" => :current_year,
      "CURRENTMONTH" => :current_month,
      "CURRENTMONTH1" => :current_month1,
      "CURRENTMONTHNAME" => :current_month_name,
      "CURRENTMONTHNAMEGEN" => :current_month_name,
      "CURRENTMONTHABBREV" => :current_month_abbrev,
      "CURRENTDAY" => :current_day,
      "CURRENTDAY2" => :current_day2,
      "CURRENTDOW" => :current_dow,
      "CURRENTDAYNAME" => :current_day_name,
      "CURRENTTIME" => :current_time,
      "CURRENTHOUR" => :current_hour,
      "CURRENTWEEK" => :current_week,
      "CURRENTTIMESTAMP" => :current_timestamp,
      # Local variants (same as current for our purposes)
      "LOCALYEAR" => :current_year,
      "LOCALMONTH" => :current_month,
      "LOCALMONTH1" => :current_month1,
      "LOCALMONTHNAME" => :current_month_name,
      "LOCALMONTHNAMEGEN" => :current_month_name,
      "LOCALMONTHABBREV" => :current_month_abbrev,
      "LOCALDAY" => :current_day,
      "LOCALDAY2" => :current_day2,
      "LOCALDOW" => :current_dow,
      "LOCALDAYNAME" => :current_day_name,
      "LOCALTIME" => :current_time,
      "LOCALHOUR" => :current_hour,
      "LOCALWEEK" => :current_week,
      "LOCALTIMESTAMP" => :current_timestamp
    }.freeze

    # String function magic words (with arguments)
    STRING_FUNCTIONS = %w[
      lc uc lcfirst ucfirst
      padleft padright
      anchorencode urlencode
      plural grammar gender
      int formatnum
    ].freeze

    # Month names for expansion
    MONTH_NAMES = %w[
      January February March April May June
      July August September October November December
    ].freeze

    MONTH_ABBREVS = %w[
      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
    ].freeze

    DAY_NAMES = %w[
      Sunday Monday Tuesday Wednesday Thursday Friday Saturday
    ].freeze

    # @param title [String, nil] page title without namespace prefix (nil => "")
    # @param namespace [String, nil] namespace name; empty means main namespace
    # @param dump_date [Time, nil] time used for date/time words (defaults to now)
    def initialize(title, namespace: "", dump_date: nil)
      @title = title || ""
      @namespace = namespace || ""
      @dump_date = dump_date || Time.now
    end

    # Main expansion method - expands all supported magic words in text.
    #
    # @param text [String, nil] wikitext
    # @return [String, nil] text with supported magic words replaced; nil, empty
    #   input and input without "{{" are returned as given
    def expand(text)
      return text if text.nil? || text.empty?

      # Early exit: no templates to process
      return text unless text.include?("{{")

      # Order matters: simple words first so that {{uc:{{PAGENAME}}}} works.
      result = expand_simple_magic_words(text)
      result = expand_string_functions(result)
      expand_titleparts(result)
    end

    private

    # Expand argument-less magic words such as {{PAGENAME}} or {{CURRENTYEAR}}.
    # Unknown words are returned unchanged.
    def expand_simple_magic_words(text)
      text.gsub(/\{\{\s*([A-Z][A-Z0-9]*)\s*\}\}/i) do |match|
        word = Regexp.last_match(1).upcase
        if PAGE_CONTEXT_WORDS.key?(word)
          expand_page_context(PAGE_CONTEXT_WORDS[word])
        elsif DATETIME_WORDS.key?(word)
          expand_datetime(DATETIME_WORDS[word])
        else
          match
        end
      end
    end

    # Expand a page context word. Every "*_encoded" type is the URL-encoded
    # form of its plain counterpart.
    def expand_page_context(type)
      name = type.to_s
      return page_context_value(type) unless name.end_with?("_encoded")

      url_encode(page_context_value(name.delete_suffix("_encoded").to_sym))
    end

    # Plain (unencoded) value of a page context word; "" for unknown types.
    def page_context_value(type)
      case type
      when :pagename
        @title
      when :fullpagename, :subjectpagename
        @namespace.empty? ? @title : "#{@namespace}:#{@title}"
      when :basepagename
        # Remove subpage part (after last /)
        @title.sub(%r{/[^/]*$}, "")
      when :rootpagename
        # Get root page (before first /)
        @title.split("/").first || @title
      when :subpagename
        # Get subpage part (after last /)
        @title.include?("/") ? @title.split("/").last : @title
      when :talkpagename
        "#{talk_namespace}:#{@title}"
      when :namespace, :subjectspace
        @namespace
      when :namespace_number
        # Main namespace = 0, others would need a lookup table
        @namespace.empty? ? "0" : ""
      when :talkspace
        talk_namespace
      else
        ""
      end
    end

    # Talk namespace name: "Talk" for the main namespace, "<ns> talk" otherwise.
    def talk_namespace
      @namespace.empty? ? "Talk" : "#{@namespace} talk"
    end

    # Expand date/time magic words from @dump_date; "" for unknown types.
    def expand_datetime(type)
      case type
      when :current_year then @dump_date.year.to_s
      when :current_month then @dump_date.month.to_s.rjust(2, "0")
      when :current_month1 then @dump_date.month.to_s
      when :current_month_name then MONTH_NAMES[@dump_date.month - 1]
      when :current_month_abbrev then MONTH_ABBREVS[@dump_date.month - 1]
      when :current_day then @dump_date.day.to_s
      when :current_day2 then @dump_date.day.to_s.rjust(2, "0")
      when :current_dow then @dump_date.wday.to_s
      when :current_day_name then DAY_NAMES[@dump_date.wday]
      when :current_time then @dump_date.strftime("%H:%M")
      when :current_hour then @dump_date.hour.to_s.rjust(2, "0")
      when :current_week then @dump_date.strftime("%V")
      when :current_timestamp then @dump_date.strftime("%Y%m%d%H%M%S")
      else ""
      end
    end

    # Expand string functions: {{lc:Text}}, {{uc:Text}}, etc.
    # Patterns stop at the first "}" so arguments containing nested templates
    # are not matched.
    def expand_string_functions(text)
      result = text.dup

      # Case functions
      result.gsub!(/\{\{\s*lc\s*:\s*([^}]*)\}\}/i) { Regexp.last_match(1).downcase }
      result.gsub!(/\{\{\s*uc\s*:\s*([^}]*)\}\}/i) { Regexp.last_match(1).upcase }
      result.gsub!(/\{\{\s*lcfirst\s*:\s*([^}]*)\}\}/i) do
        s = Regexp.last_match(1)
        s.empty? ? s : s[0].downcase + s[1..]
      end
      result.gsub!(/\{\{\s*ucfirst\s*:\s*([^}]*)\}\}/i) do
        s = Regexp.last_match(1)
        s.empty? ? s : s[0].upcase + s[1..]
      end

      # URL encoding (an optional second argument is accepted and ignored)
      result.gsub!(/\{\{\s*urlencode\s*:\s*([^}|]*?)(?:\s*\|\s*[^}]*)?\}\}/i) do
        url_encode(Regexp.last_match(1).strip)
      end
      result.gsub!(/\{\{\s*anchorencode\s*:\s*([^}]*)\}\}/i) do
        anchor_encode(Regexp.last_match(1).strip)
      end

      # Padding functions: {{padleft:string|length|pad}}
      result.gsub!(/\{\{\s*padleft\s*:\s*([^}|]*)\s*\|\s*(\d+)\s*(?:\|\s*([^}]*))?\}\}/i) do
        m = Regexp.last_match
        pad_string(m[1], m[2].to_i, m[3], :rjust)
      end
      result.gsub!(/\{\{\s*padright\s*:\s*([^}|]*)\s*\|\s*(\d+)\s*(?:\|\s*([^}]*))?\}\}/i) do
        m = Regexp.last_match
        pad_string(m[1], m[2].to_i, m[3], :ljust)
      end

      # formatnum - format number with thousand separators
      result.gsub!(/\{\{\s*formatnum\s*:\s*([^}|]*?)(?:\s*\|([^}]*))?\}\}/i) do
        m = Regexp.last_match
        num_str = m[1].strip
        option = m[2]&.strip&.downcase

        if option == "r"
          # R option: remove formatting (return raw number)
          num_str.gsub(/[,\s]/, "")
        else
          format_number_with_commas(num_str)
        end
      end

      # plural, grammar, gender - just return first argument (proper handling would need language rules)
      result.gsub!(/\{\{\s*plural\s*:\s*[^}|]*\s*\|\s*([^}|]*)[^}]*\}\}/i) { Regexp.last_match(1) }
      result.gsub!(/\{\{\s*grammar\s*:\s*[^}|]*\s*\|\s*([^}|]*)[^}]*\}\}/i) { Regexp.last_match(1) }
      result.gsub!(/\{\{\s*gender\s*:\s*[^}|]*\s*\|\s*([^}|]*)[^}]*\}\}/i) { Regexp.last_match(1) }

      # int - internationalization, just return the message name
      result.gsub!(/\{\{\s*int\s*:\s*([^}|]*)[^}]*\}\}/i) { Regexp.last_match(1).strip }

      result
    end

    # Pad str to length with pad using :rjust (padleft) or :ljust (padright).
    # Surrounding whitespace of both arguments is ignored; a missing or blank
    # pad defaults to "0".
    def pad_string(str, length, pad, direction)
      pad = pad.to_s.strip
      pad = "0" if pad.empty?
      str.strip.public_send(direction, length, pad)
    end

    # Expand #titleparts parser function
    # {{#titleparts:pagename|number of segments|first segment}}
    def expand_titleparts(text)
      text.gsub(/\{\{\s*#titleparts\s*:\s*([^}|]+)(?:\s*\|\s*([^}|]*))?(?:\s*\|\s*([^}]*))?\}\}/i) do
        m = Regexp.last_match
        title_parts(m[1].strip, m[2].to_s.strip.to_i, m[3].to_s.strip.to_i)
      end
    end

    # Select "/"-separated segments of pagename.
    #
    # @param count [Integer] positive: that many segments; negative: drop that
    #   many from the end; zero: all remaining segments
    # @param first [Integer] 1-based first segment; negative counts from the
    #   right (-1 is the last segment); zero is treated as 1
    def title_parts(pagename, count, first)
      parts = pagename.split("/")
      start = first.negative? ? [parts.length + first, 0].max : [first - 1, 0].max

      selected =
        if count.positive?
          parts[start, count]
        elsif count.negative?
          stop = parts.length + count
          stop > start ? parts[start...stop] : []
        else
          parts[start..]
        end

      # Slicing past the end yields nil; treat that as "no segments".
      (selected || []).join("/")
    end

    # URL encode a string (for PAGENAMEE variants): spaces become "_", every
    # other character outside [a-zA-Z0-9-._~] is percent-encoded byte by byte.
    def url_encode(str)
      return "" if str.nil?

      str.gsub(/[^a-zA-Z0-9\-._~]/) do |char|
        char == " " ? "_" : escape_bytes(char, "%")
      end
    end

    # Anchor encode (for fragment identifiers): spaces become "_", every other
    # character outside [A-Za-z0-9_.-] becomes ".XX" per byte.
    def anchor_encode(str)
      return "" if str.nil?

      str.gsub(" ", "_").gsub(/[^\w\-.]/) { |char| escape_bytes(char, ".") }
    end

    # Hex-escape each byte of char separately so multi-byte characters produce
    # one escape per byte (e.g. "%C3%A9"), not a single fused one.
    def escape_bytes(char, prefix)
      char.bytes.map { |byte| "#{prefix}#{format('%02X', byte)}" }.join
    end

    # Format number with thousand separators in the integer part. Everything
    # after the first "." is kept verbatim.
    def format_number_with_commas(num_str)
      return num_str if num_str.nil? || num_str.empty?

      integer_part, decimal_part = num_str.split(".", 2)
      integer_part = integer_part.to_s.gsub(/(\d)(?=(\d{3})+(?!\d))/, '\\1,')

      decimal_part ? "#{integer_part}.#{decimal_part}" : integer_part
    end
  end
end