wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "sqlite3"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
require "digest"
|
|
6
|
+
|
|
7
|
+
module Wp2txt
|
|
8
|
+
# SQLite-based cache for multistream index
|
|
9
|
+
# Dramatically speeds up repeated index access by storing parsed entries in SQLite
|
|
10
|
+
class IndexCache
|
|
11
|
+
CACHE_VERSION = 1
|
|
12
|
+
CACHE_SUFFIX = ".sqlite3"
|
|
13
|
+
|
|
14
|
+
attr_reader :cache_path, :source_path
|
|
15
|
+
|
|
16
|
+
# Prepare a cache handle for one source file. No database is opened here.
#
# @param source_path [String] path of the file this cache describes
# @param cache_dir [String, nil] directory that holds the cache file;
#   default_cache_dir is used when this is nil
def initialize(source_path, cache_dir: nil)
  @db = nil
  @source_path = source_path
  @cache_dir = cache_dir || default_cache_dir
  # Must come last: the cache path is derived from @source_path and @cache_dir.
  @cache_path = build_cache_path
end
|
|
22
|
+
|
|
23
|
+
# Tell whether the cache file can be used for the current source file.
#
# The cache is usable when both files exist, the stored cache_version equals
# CACHE_VERSION, and the stored source mtime (in whole seconds) and size
# match the source file as it is now.
#
# @return [Boolean] true if cache is usable; false otherwise, including when
#   SQLite raises while the cache is being read
def valid?
  return false unless [@cache_path, @source_path].all? { |path| File.exist?(path) }

  begin
    open_db
    meta = load_metadata
    return false unless meta
    return false unless meta[:cache_version].to_i == CACHE_VERSION

    # Metadata values are stored as text, hence the to_i conversions.
    stat = File.stat(@source_path)
    meta[:source_mtime].to_i == stat.mtime.to_i && meta[:source_size].to_i == stat.size
  rescue SQLite3::Exception
    false
  ensure
    close_db
  end
end
|
|
49
|
+
|
|
50
|
+
# Read the whole cached index into memory.
#
# Both lookup hashes share the same entry objects; each entry has the keys
# :offset, :page_id and :title.
#
# @return [Hash, nil] { entries_by_title: {}, entries_by_id: {}, stream_offsets: [] },
#   or nil when valid? reports the cache as unusable
def load
  return nil unless valid?

  by_title = {}
  by_id = {}
  offsets = []

  open_db
  begin
    @db.execute("SELECT title, page_id, byte_offset FROM index_entries") do |(title, page_id, offset)|
      record = { offset: offset, page_id: page_id, title: title }
      by_title[title] = record
      by_id[page_id] = record
    end

    @db.execute("SELECT byte_offset FROM stream_offsets ORDER BY byte_offset") do |row|
      offsets << row[0]
    end

    { entries_by_title: by_title, entries_by_id: by_id, stream_offsets: offsets }
  ensure
    close_db
  end
end
|
|
79
|
+
|
|
80
|
+
# Save index entries to cache, replacing any previous cache file.
#
# All rows are written inside a single transaction. Prepared statements are
# opened in block form so they are finalized even when an insert fails.
# On any failure the database is closed first and the partially written
# cache (including SQLite's "-wal"/"-shm" sidecar files, which exist because
# open_db switches the database to WAL mode) is removed before the error is
# re-raised.
#
# @param entries_by_title [Hash] title => entry hash; :page_id and :offset are read
# @param stream_offsets [Array<Integer>] stream offsets, one row is stored per element
# @return [true] when the cache was written
# @raise [SQLite3::Exception] when SQLite rejects a statement
# @raise [SystemCallError] when the source file cannot be stat'ed
def save(entries_by_title, stream_offsets)
  FileUtils.mkdir_p(File.dirname(@cache_path))

  # Remove old cache if exists. A leftover WAL/SHM file must not be paired
  # with the freshly created database file.
  cache_files = ["", "-wal", "-shm"].map { |suffix| "#{@cache_path}#{suffix}" }
  FileUtils.rm_f(cache_files)

  begin
    open_db
    create_schema
    source_stat = File.stat(@source_path)

    # Use transaction for better performance; it commits when the block
    # returns and rolls back when the block raises.
    @db.transaction do
      save_metadata(
        source_path: @source_path,
        source_mtime: source_stat.mtime.to_i,
        source_size: source_stat.size,
        cache_version: CACHE_VERSION,
        entry_count: entries_by_title.size
      )

      @db.prepare("INSERT INTO index_entries (title, page_id, byte_offset) VALUES (?, ?, ?)") do |stmt|
        entries_by_title.each do |title, entry|
          stmt.execute([title, entry[:page_id], entry[:offset]])
        end
      end

      @db.prepare("INSERT INTO stream_offsets (byte_offset) VALUES (?)") do |stmt|
        stream_offsets.each { |offset| stmt.execute([offset]) }
      end
    end

    true
  rescue StandardError
    # Close before deleting so the files are not removed under an open handle.
    close_db
    FileUtils.rm_f(cache_files)
    raise
  ensure
    close_db
  end
end
|
|
131
|
+
|
|
132
|
+
# Find entries by titles (efficient batch lookup).
#
# Titles are de-duplicated and queried in slices so that a long list never
# exceeds SQLite's limit on bound variables per statement.
#
# @param titles [Array<String>] titles to look up
# @param batch_size [Integer] maximum number of titles bound per query
# @return [Hash] title => entry ({ offset:, page_id:, title: }); titles that
#   are not in the cache are simply absent. Empty when titles is empty or
#   the cache is not valid.
def find_by_titles(titles, batch_size: 500)
  return {} if titles.empty?
  return {} unless valid?

  results = {}
  open_db
  begin
    titles.uniq.each_slice(batch_size) do |batch|
      # Use IN clause with placeholders for batch lookup
      placeholders = Array.new(batch.size, "?").join(",")
      sql = "SELECT title, page_id, byte_offset FROM index_entries WHERE title IN (#{placeholders})"

      # SQLite3 2.x requires bind variables as an array
      @db.execute(sql, batch) do |(title, page_id, offset)|
        results[title] = { offset: offset, page_id: page_id, title: title }
      end
    end

    results
  ensure
    close_db
  end
end
|
|
157
|
+
|
|
158
|
+
# Get cache statistics.
#
# @return [Hash, nil] nil when the cache file does not exist; otherwise a hash
#   with :cache_path, :cache_size (bytes on disk), :entry_count, :stream_count,
#   :source_path, :source_mtime (Time or nil) and :cache_version. The last
#   three come from the metadata table and are nil when it cannot be read.
def stats
  return nil unless File.exist?(@cache_path)

  open_db
  begin
    # load_metadata returns nil when the metadata query fails; fall back to an
    # empty hash so the lookups below cannot raise NoMethodError on nil.
    meta = load_metadata || {}
    entry_count = @db.get_first_value("SELECT COUNT(*) FROM index_entries")
    stream_count = @db.get_first_value("SELECT COUNT(*) FROM stream_offsets")

    {
      cache_path: @cache_path,
      cache_size: File.size(@cache_path),
      entry_count: entry_count,
      stream_count: stream_count,
      source_path: meta[:source_path],
      source_mtime: meta[:source_mtime] ? Time.at(meta[:source_mtime].to_i) : nil,
      cache_version: meta[:cache_version]
    }
  ensure
    close_db
  end
end
|
|
181
|
+
|
|
182
|
+
# Delete the cache file together with SQLite's "-wal" and "-shm" sidecar
# files. The sidecars can exist because open_db puts the database into WAL
# mode; leaving them behind could pair a stale log with a later database.
# Missing files are ignored.
def clear!
  FileUtils.rm_f(["", "-wal", "-shm"].map { |suffix| "#{@cache_path}#{suffix}" })
end
|
|
186
|
+
|
|
187
|
+
private
|
|
188
|
+
|
|
189
|
+
# Directory used when the caller supplies no cache_dir: ".wp2txt/cache"
# under the current user's home directory.
#
# @return [String] absolute path
def default_cache_dir
  home_relative = "~/.wp2txt/cache"
  File.expand_path(home_relative)
end
|
|
192
|
+
|
|
193
|
+
# Derive the cache file location from the source path.
#
# The file name is the source basename without its last extension (and
# without a trailing "-index"), followed by "_" and the first 8 hex digits
# of the MD5 of the source path exactly as given, then CACHE_SUFFIX. The
# digest keeps equally named files from different directories apart.
#
# @return [String] path inside @cache_dir
def build_cache_path
  stem = File.basename(@source_path, ".*").sub(/-index$/, "")
  digest = Digest::MD5.hexdigest(@source_path).slice(0, 8)
  File.join(@cache_dir, [stem, "_", digest, CACHE_SUFFIX].join)
end
|
|
199
|
+
|
|
200
|
+
# Open the SQLite database at @cache_path unless a handle is already held,
# then apply the connection settings. The pragmas are issued on every call,
# also when the handle was already open.
def open_db
  @db = SQLite3::Database.new(@cache_path) unless @db

  # Performance optimizations
  pragmas = [
    "PRAGMA journal_mode = WAL",
    "PRAGMA synchronous = NORMAL",
    "PRAGMA cache_size = -64000" # 64MB cache
  ]
  pragmas.map { |pragma| @db.execute(pragma) }.last
end
|
|
207
|
+
|
|
208
|
+
# Close the database handle if one is open and forget it. Calling this
# without an open handle does nothing.
def close_db
  @db.close unless @db.nil?
  @db = nil
end
|
|
212
|
+
|
|
213
|
+
# Create the cache tables and indexes on the open database. Every statement
# uses IF NOT EXISTS, so running this against an existing schema is harmless.
#
# Tables: metadata (key/value text), index_entries (title, page_id,
# byte_offset; indexed on page_id and byte_offset) and stream_offsets.
def create_schema
  statements = [
    <<~SQL,
      CREATE TABLE IF NOT EXISTS metadata (
        key TEXT PRIMARY KEY,
        value TEXT
      )
    SQL
    <<~SQL,
      CREATE TABLE IF NOT EXISTS index_entries (
        title TEXT PRIMARY KEY,
        page_id INTEGER,
        byte_offset INTEGER
      )
    SQL
    "CREATE INDEX IF NOT EXISTS idx_page_id ON index_entries(page_id)",
    "CREATE INDEX IF NOT EXISTS idx_byte_offset ON index_entries(byte_offset)",
    <<~SQL
      CREATE TABLE IF NOT EXISTS stream_offsets (
        byte_offset INTEGER PRIMARY KEY
      )
    SQL
  ]

  statements.map { |statement| @db.execute(statement) }.last
end
|
|
238
|
+
|
|
239
|
+
# Store key/value pairs in the metadata table, replacing rows whose key
# already exists. Keys and values are written as strings (to_s).
#
# @param hash [Hash] metadata to persist
def save_metadata(hash)
  stmt = @db.prepare("INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)")
  begin
    hash.each { |key, value| stmt.execute([key.to_s, value.to_s]) }
  ensure
    # Finalize the statement even when an insert raises, so it is not left
    # open on the connection.
    stmt.close
  end
end
|
|
246
|
+
|
|
247
|
+
# Read the metadata table into a hash with symbol keys. Values are returned
# exactly as the database yields them.
#
# @return [Hash, nil] the metadata, or nil when SQLite raises
def load_metadata
  {}.tap do |meta|
    @db.execute("SELECT key, value FROM metadata") do |(key, value)|
      meta[key.to_sym] = value
    end
  end
rescue SQLite3::Exception
  nil
end
|
|
257
|
+
end
|
|
258
|
+
end
|
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "regex"
|
|
4
|
+
|
|
5
|
+
module Wp2txt
|
|
6
|
+
# Expands MediaWiki magic words to their actual values
|
|
7
|
+
# Supports: page context variables, date/time variables, string functions
|
|
8
|
+
class MagicWordExpander
|
|
9
|
+
# Page context magic words: upper-cased word => selector symbol understood by
# expand_page_context. Every word except NAMESPACENUMBER also gets an "E"
# (encoded) twin whose selector carries an "_encoded" suffix. The ARTICLE*
# words are aliases that share the SUBJECT* selectors.
PAGE_CONTEXT_WORDS = [
  ["PAGENAME", :pagename],
  ["FULLPAGENAME", :fullpagename],
  ["BASEPAGENAME", :basepagename],
  ["ROOTPAGENAME", :rootpagename],
  ["SUBPAGENAME", :subpagename],
  ["TALKPAGENAME", :talkpagename],
  ["SUBJECTPAGENAME", :subjectpagename],
  ["ARTICLEPAGENAME", :subjectpagename],
  ["NAMESPACE", :namespace],
  ["NAMESPACENUMBER", :namespace_number],
  ["TALKSPACE", :talkspace],
  ["SUBJECTSPACE", :subjectspace],
  ["ARTICLESPACE", :subjectspace]
].each_with_object({}) do |(word, selector), table|
  table[word] = selector
  table["#{word}E"] = :"#{selector}_encoded" unless word == "NAMESPACENUMBER"
end.freeze
|
|
37
|
+
|
|
38
|
+
# Date/time magic words: upper-cased word => selector symbol understood by
# expand_datetime. Each CURRENT* word has a LOCAL* twin that maps to the same
# selector (local variants are treated the same as current ones here).
DATETIME_WORDS = {
  "CURRENTYEAR" => :current_year,
  "CURRENTMONTH" => :current_month,
  "CURRENTMONTH1" => :current_month1,
  "CURRENTMONTHNAME" => :current_month_name,
  "CURRENTMONTHNAMEGEN" => :current_month_name,
  "CURRENTMONTHABBREV" => :current_month_abbrev,
  "CURRENTDAY" => :current_day,
  "CURRENTDAY2" => :current_day2,
  "CURRENTDOW" => :current_dow,
  "CURRENTDAYNAME" => :current_day_name,
  "CURRENTTIME" => :current_time,
  "CURRENTHOUR" => :current_hour,
  "CURRENTWEEK" => :current_week,
  "CURRENTTIMESTAMP" => :current_timestamp
}.then do |current|
  local = current.transform_keys { |word| word.sub("CURRENT", "LOCAL") }
  current.merge(local).freeze
end
|
|
70
|
+
|
|
71
|
+
# String function magic words (with arguments)
STRING_FUNCTIONS = [
  "lc", "uc", "lcfirst", "ucfirst",
  "padleft", "padright",
  "anchorencode", "urlencode",
  "plural", "grammar", "gender",
  "int", "formatnum"
].freeze

# English month names, January first (index = month number - 1)
MONTH_NAMES = [
  "January", "February", "March", "April", "May", "June",
  "July", "August", "September", "October", "November", "December"
].freeze

# Three-letter month abbreviations: the first three letters of each month name
MONTH_ABBREVS = MONTH_NAMES.map { |name| name[0, 3].freeze }.freeze

# English weekday names, Sunday first (index = Time#wday)
DAY_NAMES = [
  "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"
].freeze
|
|
93
|
+
|
|
94
|
+
# @param title [String, nil] page title used for the page-context words;
#   nil is stored as ""
# @param namespace [String, nil] namespace name of the page; "" (or nil)
#   is treated as the main namespace by the expansion methods
# @param dump_date [Time, nil] moment used for the date/time words; the
#   current time is used when nil
def initialize(title, namespace: "", dump_date: nil)
  @title, @namespace = title || "", namespace || ""
  @dump_date = dump_date || Time.now
end
|
|
99
|
+
|
|
100
|
+
# Main expansion method - expands all supported magic words in text.
#
# Text that is nil, empty, or contains no "{{" is returned as is (the very
# same object). Otherwise a copy is passed through three stages in order:
# simple magic words ({{PAGENAME}}, {{CURRENTYEAR}}, ...), string functions
# ({{lc:Text}}, {{uc:Text}}, ...) and the #titleparts parser function.
#
# @param text [String, nil] wikitext
# @return [String, nil] expanded text
def expand(text)
  return text if text.nil? || text.empty? || !text.include?("{{")

  stages = %i[expand_simple_magic_words expand_string_functions expand_titleparts]
  stages.reduce(text.dup) { |current, stage| send(stage, current) }
end
|
|
120
|
+
|
|
121
|
+
private
|
|
122
|
+
|
|
123
|
+
# Expand argument-less magic words of the form {{WORD}}.
#
# The word is matched case-insensitively and upper-cased before lookup.
# PAGE_CONTEXT_WORDS takes precedence over DATETIME_WORDS; anything found in
# neither table is left in the text unchanged.
#
# @param text [String] wikitext
# @return [String] new string with recognized words replaced
def expand_simple_magic_words(text)
  text.gsub(/\{\{\s*([A-Z][A-Z0-9]*)\s*\}\}/i) do |whole|
    word = Regexp.last_match(1).upcase

    if (selector = PAGE_CONTEXT_WORDS[word])
      expand_page_context(selector)
    elsif (selector = DATETIME_WORDS[word])
      expand_datetime(selector)
    else
      whole
    end
  end
end
|
|
137
|
+
|
|
138
|
+
# Expand one page-context selector (a value of PAGE_CONTEXT_WORDS).
#
# The plain value is computed first; selectors ending in "_encoded" return
# that value passed through url_encode. Unknown selectors give "".
#
# @param type [Symbol] selector such as :pagename or :talkspace_encoded
# @return [String, nil] expansion; nil is possible for :subpagename when the
#   title consists only of slashes
def expand_page_context(type)
  in_main = @namespace.empty?
  full_name = in_main ? @title : "#{@namespace}:#{@title}"
  talk_space = in_main ? "Talk" : "#{@namespace} talk"

  plain =
    case type
    when :pagename, :pagename_encoded
      @title
    when :fullpagename, :fullpagename_encoded, :subjectpagename, :subjectpagename_encoded
      full_name
    when :basepagename, :basepagename_encoded
      # Remove subpage part (after last /)
      @title.sub(%r{/[^/]*$}, "")
    when :rootpagename, :rootpagename_encoded
      # Get root page (before first /)
      @title.split("/").first || @title
    when :subpagename, :subpagename_encoded
      # Get subpage part (after last /)
      @title.include?("/") ? @title.split("/").last : @title
    when :talkpagename, :talkpagename_encoded
      # url_encode turns the space of "<ns> talk" into "_" for the encoded form
      "#{talk_space}:#{@title}"
    when :namespace, :namespace_encoded, :subjectspace, :subjectspace_encoded
      @namespace
    when :talkspace, :talkspace_encoded
      talk_space
    when :namespace_number
      # Main namespace = 0, others would need a lookup table
      in_main ? "0" : ""
    else
      return ""
    end

  type.to_s.end_with?("_encoded") ? url_encode(plain) : plain
end
|
|
194
|
+
|
|
195
|
+
# Expand one date/time selector (a value of DATETIME_WORDS) using @dump_date.
#
# @param type [Symbol] selector such as :current_year or :current_timestamp
# @return [String] formatted value; "" for an unknown selector
def expand_datetime(type)
  date = @dump_date

  case type
  when :current_year then date.year.to_s
  when :current_month then format("%02d", date.month)   # zero-padded month
  when :current_month1 then date.month.to_s              # unpadded month
  when :current_month_name then MONTH_NAMES[date.month - 1]
  when :current_month_abbrev then MONTH_ABBREVS[date.month - 1]
  when :current_day then date.day.to_s                   # unpadded day
  when :current_day2 then format("%02d", date.day)       # zero-padded day
  when :current_dow then date.wday.to_s                  # 0 = Sunday
  when :current_day_name then DAY_NAMES[date.wday]
  when :current_time then date.strftime("%H:%M")
  when :current_hour then format("%02d", date.hour)
  when :current_week then date.strftime("%V")            # ISO 8601 week number
  when :current_timestamp then date.strftime("%Y%m%d%H%M%S")
  else ""
  end
end
|
|
228
|
+
|
|
229
|
+
# Expand string functions: {{lc:Text}}, {{uc:Text}}, etc.
#
# Handles lc, uc, lcfirst, ucfirst, urlencode, anchorencode, padleft,
# padright, formatnum, plural, grammar, gender and int. Arguments are matched
# up to the first "}", so calls that contain nested templates are not
# matched by these patterns.
#
# padleft/padright trim their arguments, tolerate whitespace after the
# length, and cap the target length at 500 characters (the limit MediaWiki's
# own implementation applies) so a huge length cannot force a huge allocation.
#
# @param text [String] wikitext
# @return [String] a new string with the recognized functions replaced
def expand_string_functions(text)
  result = text.dup
  max_pad_length = 500

  # Simple string case functions (no nesting issues)
  result.gsub!(/\{\{\s*lc\s*:\s*([^}]*)\}\}/i) { Regexp.last_match(1).downcase }
  result.gsub!(/\{\{\s*uc\s*:\s*([^}]*)\}\}/i) { Regexp.last_match(1).upcase }
  result.gsub!(/\{\{\s*lcfirst\s*:\s*([^}]*)\}\}/i) do
    s = Regexp.last_match(1)
    s.empty? ? s : s[0].downcase + s[1..]
  end
  result.gsub!(/\{\{\s*ucfirst\s*:\s*([^}]*)\}\}/i) do
    s = Regexp.last_match(1)
    s.empty? ? s : s[0].upcase + s[1..]
  end

  # URL encoding (an optional second argument is accepted and ignored)
  result.gsub!(/\{\{\s*urlencode\s*:\s*([^}|]*?)(?:\s*\|\s*[^}]*)?\}\}/i) do
    url_encode(Regexp.last_match(1).strip)
  end
  result.gsub!(/\{\{\s*anchorencode\s*:\s*([^}]*)\}\}/i) do
    anchor_encode(Regexp.last_match(1).strip)
  end

  # Padding functions: {{padleft:string|length|pad}}
  { "padleft" => :rjust, "padright" => :ljust }.each do |name, justify|
    pattern = /\{\{\s*#{name}\s*:([^}|]*)\|\s*(\d+)\s*(?:\|([^}]*))?\}\}/i
    result.gsub!(pattern) do
      # Read all captures before calling anything that could run another match.
      str = Regexp.last_match(1).strip
      len = [Regexp.last_match(2).to_i, max_pad_length].min
      pad = Regexp.last_match(3).to_s.strip
      pad = "0" if pad.empty? # missing or blank pad argument defaults to "0"
      str.public_send(justify, len, pad)
    end
  end

  # formatnum - format number with thousand separators
  result.gsub!(/\{\{\s*formatnum\s*:\s*([^}|]*?)(?:\s*\|([^}]*))?\}\}/i) do
    num_str = Regexp.last_match(1).strip
    option = Regexp.last_match(2)&.strip&.downcase

    if option == "r"
      # R option: remove formatting (return raw number)
      num_str.gsub(/[,\s]/, "")
    else
      # Add thousand separators
      format_number_with_commas(num_str)
    end
  end

  # plural, grammar, gender - just return first argument (proper handling would need language rules)
  result.gsub!(/\{\{\s*plural\s*:\s*[^}|]*\s*\|\s*([^}|]*)[^}]*\}\}/i) { Regexp.last_match(1) }
  result.gsub!(/\{\{\s*grammar\s*:\s*[^}|]*\s*\|\s*([^}|]*)[^}]*\}\}/i) { Regexp.last_match(1) }
  result.gsub!(/\{\{\s*gender\s*:\s*[^}|]*\s*\|\s*([^}|]*)[^}]*\}\}/i) { Regexp.last_match(1) }

  # int - internationalization, just return the message name
  result.gsub!(/\{\{\s*int\s*:\s*([^}|]*)[^}]*\}\}/i) { Regexp.last_match(1).strip }

  result
end
|
|
291
|
+
|
|
292
|
+
# Expand #titleparts parser function
# {{#titleparts:pagename|number of segments|first segment}}
#
# The page name is split on "/".
# * number of segments > 0: return that many segments
# * number of segments < 0: drop that many segments from the end
# * number of segments missing or 0: return everything from the first segment
# * first segment > 0: 1-based start position
# * first segment < 0: start position counted from the right
# * first segment missing or 0: start at the first segment
#
# @param text [String] wikitext
# @return [String] new string with every #titleparts call replaced
def expand_titleparts(text)
  text.gsub(/\{\{\s*#titleparts\s*:\s*([^}|]+)(?:\s*\|\s*([^}|]*))?(?:\s*\|\s*([^}]*))?\}\}/i) do
    match = Regexp.last_match
    parts = match[1].strip.split("/")
    # Missing or non-numeric arguments become 0 via to_i.
    num_segments = match[2].to_s.strip.to_i
    first_segment = match[3].to_s.strip.to_i

    first_idx =
      if first_segment.positive?
        first_segment - 1
      elsif first_segment.negative?
        # Count from the right; start at the beginning when out of range.
        [parts.length + first_segment, 0].max
      else
        0
      end

    selected =
      if num_segments.positive?
        parts[first_idx, num_segments]
      elsif num_segments.negative?
        # Negative means "all but last N"
        end_idx = parts.length + num_segments
        end_idx > first_idx ? parts[first_idx...end_idx] : []
      else
        parts[first_idx..]
      end

    # Slicing past the end yields nil; treat that as "no segments".
    (selected || []).join("/")
  end
end
|
|
315
|
+
|
|
316
|
+
# URL encode a string (for PAGENAMEE variants)
#
# ASCII letters, digits and "-", ".", "_", "~" are kept, a space becomes "_",
# and every other character is percent-encoded byte by byte, so a multi-byte
# character such as "é" becomes "%C3%A9".
#
# @param str [String, nil] text to encode
# @return [String] encoded text; "" for nil
def url_encode(str)
  return "" if str.nil?

  str.gsub(/[^a-zA-Z0-9\-._~]/) do |char|
    char == " " ? "_" : char.bytes.map { |byte| format("%%%02X", byte) }.join
  end
end
|
|
324
|
+
|
|
325
|
+
# Anchor encode (for fragment identifiers)
#
# Spaces become "_". Every character other than an ASCII letter, digit, "_",
# "-" or "." is replaced by ".XX" for each of its bytes (upper-case hex), so
# "é" becomes ".C3.A9".
#
# @param str [String, nil] text to encode
# @return [String] encoded text; "" for nil
def anchor_encode(str)
  return "" if str.nil?

  str.gsub(" ", "_").gsub(/[^\w\-.]/) do |char|
    char.bytes.map { |byte| format(".%02X", byte) }.join
  end
end
|
|
333
|
+
|
|
334
|
+
# Format number with thousand separators
#
# Only the part before the first "." is grouped; everything from that dot on
# is appended unchanged, so no input characters are lost.
#
# @param num_str [String, nil] number as text
# @return [String, nil] text with "," inserted every three digits of the
#   integer part; nil or "" is returned unchanged
def format_number_with_commas(num_str)
  return num_str if num_str.nil? || num_str.empty?

  # partition (unlike split) keeps the separator and any further dots.
  integer_part, dot, decimal_part = num_str.partition(".")

  # Add thousand separators to integer part
  grouped = integer_part.gsub(/(\d)(?=(\d{3})+(?!\d))/, '\1,')

  "#{grouped}#{dot}#{decimal_part}"
end
|
|
352
|
+
end
|
|
353
|
+
end
|