wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
data/lib/wp2txt/version.rb
CHANGED
data/lib/wp2txt.rb
CHANGED
|
@@ -3,17 +3,24 @@
|
|
|
3
3
|
require "nokogiri"
|
|
4
4
|
require_relative "wp2txt/article"
|
|
5
5
|
require_relative "wp2txt/utils"
|
|
6
|
+
require_relative "wp2txt/stream_processor"
|
|
7
|
+
require_relative "wp2txt/output_writer"
|
|
6
8
|
|
|
7
9
|
module Wp2txt
|
|
8
10
|
class Splitter
|
|
9
11
|
include Wp2txt
|
|
10
|
-
|
|
12
|
+
|
|
13
|
+
attr_reader :size_read, :file_index
|
|
14
|
+
|
|
15
|
+
def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false, &progress_callback)
|
|
11
16
|
@fp = nil
|
|
12
17
|
@input_file = input_file
|
|
13
18
|
@output_dir = output_dir
|
|
14
19
|
@tfile_size = tfile_size
|
|
15
20
|
require "bzip2-ruby" if bz2_gem
|
|
16
21
|
@bz2_gem = bz2_gem
|
|
22
|
+
@progress_callback = progress_callback
|
|
23
|
+
@last_progress_time = Time.now
|
|
17
24
|
prepare
|
|
18
25
|
end
|
|
19
26
|
|
|
@@ -26,7 +33,7 @@ module Wp2txt
|
|
|
26
33
|
loop do
|
|
27
34
|
begin
|
|
28
35
|
a = file.read(unit)
|
|
29
|
-
rescue
|
|
36
|
+
rescue IOError, Errno::EIO, Errno::ENOENT
|
|
30
37
|
a = nil
|
|
31
38
|
end
|
|
32
39
|
break unless a
|
|
@@ -46,20 +53,22 @@ module Wp2txt
|
|
|
46
53
|
# check if a given command exists: return the path if it does, return false if not
|
|
47
54
|
def command_exist?(command)
|
|
48
55
|
basename = File.basename(command)
|
|
49
|
-
path = +""
|
|
50
56
|
print "Checking #{basename}: "
|
|
51
57
|
begin
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
58
|
+
# Use IO.popen instead of open("| ...") for Ruby 4.0 compatibility
|
|
59
|
+
path = IO.popen(["which", command], err: File::NULL, &:read).strip
|
|
60
|
+
if path.empty?
|
|
61
|
+
path = IO.popen(["which", basename], err: File::NULL, &:read).strip
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
if path.empty?
|
|
59
65
|
puts "#{basename} not found"
|
|
60
66
|
false
|
|
67
|
+
else
|
|
68
|
+
puts "detected [#{path}]"
|
|
69
|
+
path
|
|
61
70
|
end
|
|
62
|
-
rescue
|
|
71
|
+
rescue Errno::ENOENT, Errno::EPIPE, IOError
|
|
63
72
|
puts "#{basename} not found"
|
|
64
73
|
false
|
|
65
74
|
end
|
|
@@ -75,13 +84,13 @@ module Wp2txt
|
|
|
75
84
|
if @bz2_gem
|
|
76
85
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
|
77
86
|
elsif Gem.win_platform?
|
|
78
|
-
file = IO.popen("bunzip2.exe -c
|
|
87
|
+
file = IO.popen(["bunzip2.exe", "-c", @input_file])
|
|
79
88
|
elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
|
|
80
|
-
file = IO.popen(
|
|
89
|
+
file = IO.popen([bzpath, "-c", "-d", @input_file])
|
|
81
90
|
end
|
|
82
91
|
else # meaning that it is a text file
|
|
83
92
|
@infile_size = File.stat(@input_file).size
|
|
84
|
-
file = open(@input_file)
|
|
93
|
+
file = File.open(@input_file, "r:UTF-8")
|
|
85
94
|
end
|
|
86
95
|
|
|
87
96
|
# create basename of output file
|
|
@@ -101,7 +110,7 @@ module Wp2txt
|
|
|
101
110
|
loop do
|
|
102
111
|
begin
|
|
103
112
|
new_lines = @file_pointer.read(10_485_760)
|
|
104
|
-
rescue
|
|
113
|
+
rescue IOError, Errno::EIO, Errno::ENOENT, Errno::EPIPE
|
|
105
114
|
return nil
|
|
106
115
|
end
|
|
107
116
|
return nil unless new_lines
|
|
@@ -114,9 +123,10 @@ module Wp2txt
|
|
|
114
123
|
|
|
115
124
|
new_first_line = temp_buf.shift
|
|
116
125
|
@buffer.last << new_first_line
|
|
117
|
-
|
|
118
|
-
@buffer
|
|
119
|
-
@buffer
|
|
126
|
+
# Use end_with? instead of [-1, 1] for clarity and performance
|
|
127
|
+
@buffer << +"" if new_first_line.end_with?("\n")
|
|
128
|
+
@buffer.concat(temp_buf) unless temp_buf.empty?
|
|
129
|
+
@buffer << +"" if @buffer.last.end_with?("\n")
|
|
120
130
|
break if @buffer.size > 1
|
|
121
131
|
end
|
|
122
132
|
true
|
|
@@ -144,6 +154,10 @@ module Wp2txt
|
|
|
144
154
|
@total_size += text.bytesize
|
|
145
155
|
output_text << text
|
|
146
156
|
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
|
157
|
+
|
|
158
|
+
# Report progress every 5 seconds
|
|
159
|
+
report_progress
|
|
160
|
+
|
|
147
161
|
# never close the file until the end of the page even if end_flag is on
|
|
148
162
|
next unless end_flag && %r{</page} =~ text
|
|
149
163
|
|
|
@@ -157,7 +171,7 @@ module Wp2txt
|
|
|
157
171
|
@outfiles << outfilename
|
|
158
172
|
@fp = File.open(outfilename, "w")
|
|
159
173
|
end
|
|
160
|
-
@fp.puts(output_text)
|
|
174
|
+
@fp.puts(output_text) unless output_text.empty?
|
|
161
175
|
@fp.close
|
|
162
176
|
|
|
163
177
|
if outfilename && File.size(outfilename).zero?
|
|
@@ -167,6 +181,18 @@ module Wp2txt
|
|
|
167
181
|
|
|
168
182
|
rename(@outfiles, "xml")
|
|
169
183
|
end
|
|
184
|
+
|
|
185
|
+
private
|
|
186
|
+
|
|
187
|
+
def report_progress
|
|
188
|
+
return unless @progress_callback
|
|
189
|
+
|
|
190
|
+
now = Time.now
|
|
191
|
+
return if now - @last_progress_time < 5 # Report every 5 seconds
|
|
192
|
+
|
|
193
|
+
@last_progress_time = now
|
|
194
|
+
@progress_callback.call(@size_read, @file_index)
|
|
195
|
+
end
|
|
170
196
|
end
|
|
171
197
|
|
|
172
198
|
class Runner
|
|
@@ -183,7 +209,7 @@ module Wp2txt
|
|
|
183
209
|
|
|
184
210
|
def prepare
|
|
185
211
|
@infile_size = File.stat(@input_file).size
|
|
186
|
-
file = open(@input_file)
|
|
212
|
+
file = File.open(@input_file, "r:UTF-8")
|
|
187
213
|
@file_pointer = file
|
|
188
214
|
@outfile_base = File.basename(@input_file, ".*")
|
|
189
215
|
@total_size = 0
|
|
@@ -194,7 +220,7 @@ module Wp2txt
|
|
|
194
220
|
loop do
|
|
195
221
|
begin
|
|
196
222
|
new_lines = @file_pointer.read(10_485_760)
|
|
197
|
-
rescue
|
|
223
|
+
rescue IOError, Errno::EIO, Errno::ENOENT, Errno::EPIPE
|
|
198
224
|
return nil
|
|
199
225
|
end
|
|
200
226
|
return nil unless new_lines
|
|
@@ -206,10 +232,11 @@ module Wp2txt
|
|
|
206
232
|
temp_buf << ss.rest unless ss.eos?
|
|
207
233
|
|
|
208
234
|
new_first_line = temp_buf.shift
|
|
209
|
-
@buffer.last <<
|
|
210
|
-
|
|
211
|
-
@buffer
|
|
212
|
-
@buffer
|
|
235
|
+
@buffer.last << new_first_line
|
|
236
|
+
# Use end_with? instead of [-1, 1] for clarity and performance
|
|
237
|
+
@buffer << +"" if new_first_line.end_with?("\n")
|
|
238
|
+
@buffer.concat(temp_buf) unless temp_buf.empty?
|
|
239
|
+
@buffer << +"" if @buffer.last.end_with?("\n")
|
|
213
240
|
break if @buffer.size > 1
|
|
214
241
|
end
|
|
215
242
|
true
|
|
@@ -247,7 +274,7 @@ module Wp2txt
|
|
|
247
274
|
else
|
|
248
275
|
page.force_encoding("utf-8")
|
|
249
276
|
end
|
|
250
|
-
rescue
|
|
277
|
+
rescue ::Encoding::InvalidByteSequenceError, ::Encoding::UndefinedConversionError
|
|
251
278
|
page
|
|
252
279
|
end
|
|
253
280
|
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Benchmark script for wp2txt regex performance
|
|
4
|
+
# Times the cleanup and format_wiki methods on small and large sample texts
|
|
5
|
+
#
|
|
6
|
+
# Usage: ruby scripts/benchmark_regex.rb
|
|
7
|
+
|
|
8
|
+
require "benchmark"
|
|
9
|
+
begin
|
|
10
|
+
require "benchmark/ips"
|
|
11
|
+
rescue LoadError
|
|
12
|
+
# benchmark-ips is optional
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
# Add lib to load path
|
|
16
|
+
$LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
|
|
17
|
+
require "wp2txt"
|
|
18
|
+
require "wp2txt/article"
|
|
19
|
+
|
|
20
|
+
# Sample Wikipedia-like content for benchmarking
|
|
21
|
+
SAMPLE_TEXT = <<~WIKI
|
|
22
|
+
{{Infobox person
|
|
23
|
+
| name = Test Person
|
|
24
|
+
| birth_date = 1980-01-01
|
|
25
|
+
| occupation = Writer
|
|
26
|
+
}}
|
|
27
|
+
'''Test Article''' is a [[test]] article with various [[wiki markup|markup]].
|
|
28
|
+
|
|
29
|
+
== Section 1 ==
|
|
30
|
+
This section has some {{cite web|url=http://example.com|title=Example}} references.
|
|
31
|
+
There are also [[Category:Test]] links and [[File:Image.jpg|thumb|A caption]].
|
|
32
|
+
|
|
33
|
+
=== Subsection ===
|
|
34
|
+
More content with '''bold''' and ''italic'' text.
|
|
35
|
+
* List item 1
|
|
36
|
+
* List item 2
|
|
37
|
+
# Numbered item
|
|
38
|
+
|
|
39
|
+
== Section 2 ==
|
|
40
|
+
{| class="wikitable"
|
|
41
|
+
|-
|
|
42
|
+
! Header 1 !! Header 2
|
|
43
|
+
|-
|
|
44
|
+
| Cell 1 || Cell 2
|
|
45
|
+
|}
|
|
46
|
+
|
|
47
|
+
Some text with entities and ♪ characters.
|
|
48
|
+
Also has <ref name="test">Reference content</ref> and <nowiki>[[preserved]]</nowiki>.
|
|
49
|
+
|
|
50
|
+
{{DEFAULTSORT:Test Article}}
|
|
51
|
+
[[Category:Articles]]
|
|
52
|
+
[[Category:Tests]]
|
|
53
|
+
WIKI
|
|
54
|
+
|
|
55
|
+
# Create multiple copies for more realistic benchmarking
|
|
56
|
+
LARGE_TEXT = (SAMPLE_TEXT * 100).freeze
|
|
57
|
+
|
|
58
|
+
class BenchmarkRunner
|
|
59
|
+
include Wp2txt
|
|
60
|
+
|
|
61
|
+
def initialize
|
|
62
|
+
@nowikis = {}
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def run_cleanup(text)
|
|
66
|
+
cleanup(text.dup)
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
def run_full_format(text)
|
|
70
|
+
format_wiki(text.dup)
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def run_benchmarks
|
|
75
|
+
puts "=" * 60
|
|
76
|
+
puts "wp2txt Regex Performance Benchmark"
|
|
77
|
+
puts "=" * 60
|
|
78
|
+
puts
|
|
79
|
+
puts "Ruby version: #{RUBY_VERSION}"
|
|
80
|
+
puts "Sample text size: #{SAMPLE_TEXT.bytesize} bytes"
|
|
81
|
+
puts "Large text size: #{LARGE_TEXT.bytesize} bytes"
|
|
82
|
+
puts
|
|
83
|
+
|
|
84
|
+
runner = BenchmarkRunner.new
|
|
85
|
+
|
|
86
|
+
puts "-" * 60
|
|
87
|
+
puts "Warmup (JIT compilation, method caching)"
|
|
88
|
+
puts "-" * 60
|
|
89
|
+
5.times { runner.run_cleanup(SAMPLE_TEXT) }
|
|
90
|
+
5.times { runner.run_full_format(SAMPLE_TEXT) }
|
|
91
|
+
puts "Done."
|
|
92
|
+
puts
|
|
93
|
+
|
|
94
|
+
puts "-" * 60
|
|
95
|
+
puts "Benchmark: cleanup() method"
|
|
96
|
+
puts "-" * 60
|
|
97
|
+
|
|
98
|
+
Benchmark.bm(20) do |x|
|
|
99
|
+
x.report("cleanup (small):") do
|
|
100
|
+
1000.times { runner.run_cleanup(SAMPLE_TEXT) }
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
x.report("cleanup (large):") do
|
|
104
|
+
10.times { runner.run_cleanup(LARGE_TEXT) }
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
puts
|
|
109
|
+
puts "-" * 60
|
|
110
|
+
puts "Benchmark: format_wiki() method (full pipeline)"
|
|
111
|
+
puts "-" * 60
|
|
112
|
+
|
|
113
|
+
Benchmark.bm(20) do |x|
|
|
114
|
+
x.report("format_wiki (small):") do
|
|
115
|
+
1000.times { runner.run_full_format(SAMPLE_TEXT) }
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
x.report("format_wiki (large):") do
|
|
119
|
+
10.times { runner.run_full_format(LARGE_TEXT) }
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# If benchmark-ips is available, run IPS benchmarks
|
|
124
|
+
if defined?(Benchmark::IPS)
|
|
125
|
+
puts
|
|
126
|
+
puts "-" * 60
|
|
127
|
+
puts "IPS Benchmark (iterations per second)"
|
|
128
|
+
puts "-" * 60
|
|
129
|
+
|
|
130
|
+
Benchmark.ips do |x|
|
|
131
|
+
x.report("cleanup") { runner.run_cleanup(SAMPLE_TEXT) }
|
|
132
|
+
x.report("format_wiki") { runner.run_full_format(SAMPLE_TEXT) }
|
|
133
|
+
x.compare!
|
|
134
|
+
end
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
puts
|
|
138
|
+
puts "-" * 60
|
|
139
|
+
puts "Memory profile (approximate)"
|
|
140
|
+
puts "-" * 60
|
|
141
|
+
|
|
142
|
+
# Simple memory measurement
|
|
143
|
+
GC.start
|
|
144
|
+
before = GC.stat[:total_allocated_objects]
|
|
145
|
+
100.times { runner.run_cleanup(SAMPLE_TEXT) }
|
|
146
|
+
after = GC.stat[:total_allocated_objects]
|
|
147
|
+
puts "cleanup() allocations per call: ~#{(after - before) / 100}"
|
|
148
|
+
|
|
149
|
+
GC.start
|
|
150
|
+
before = GC.stat[:total_allocated_objects]
|
|
151
|
+
100.times { runner.run_full_format(SAMPLE_TEXT) }
|
|
152
|
+
after = GC.stat[:total_allocated_objects]
|
|
153
|
+
puts "format_wiki() allocations per call: ~#{(after - before) / 100}"
|
|
154
|
+
|
|
155
|
+
puts
|
|
156
|
+
puts "=" * 60
|
|
157
|
+
puts "Benchmark complete"
|
|
158
|
+
puts "=" * 60
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
run_benchmarks
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Fetches HTML named character references from WHATWG HTML specification
|
|
4
|
+
# Usage: ruby scripts/fetch_html_entities.rb
|
|
5
|
+
#
|
|
6
|
+
# This script downloads the official entities.json from WHATWG and converts
|
|
7
|
+
# it into a format suitable for wp2txt text processing.
|
|
8
|
+
|
|
9
|
+
require "net/http"
|
|
10
|
+
require "json"
|
|
11
|
+
require "fileutils"
|
|
12
|
+
|
|
13
|
+
WHATWG_ENTITIES_URL = "https://html.spec.whatwg.org/entities.json"
|
|
14
|
+
|
|
15
|
+
def fetch_whatwg_entities
|
|
16
|
+
puts "Fetching entities from WHATWG HTML specification..."
|
|
17
|
+
uri = URI(WHATWG_ENTITIES_URL)
|
|
18
|
+
|
|
19
|
+
response = Net::HTTP.get_response(uri)
|
|
20
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
21
|
+
warn "Failed to fetch entities: HTTP #{response.code}"
|
|
22
|
+
return nil
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
JSON.parse(response.body)
|
|
26
|
+
rescue StandardError => e
|
|
27
|
+
warn "Error fetching entities: #{e.message}"
|
|
28
|
+
nil
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def convert_entities(raw_data)
|
|
32
|
+
entities = {}
|
|
33
|
+
|
|
34
|
+
raw_data.each do |name, info|
|
|
35
|
+
# Only include entries with semicolon (standard form)
|
|
36
|
+
# Skip legacy forms without semicolon like "&nbsp"
|
|
37
|
+
next unless name.end_with?(";")
|
|
38
|
+
|
|
39
|
+
# Use the full entity reference (including & and ;) as the key
|
|
40
|
+
# e.g., "&alpha;" => "α"
|
|
41
|
+
key = name
|
|
42
|
+
|
|
43
|
+
# Get the character(s)
|
|
44
|
+
characters = info["characters"]
|
|
45
|
+
next if characters.nil? || characters.empty?
|
|
46
|
+
|
|
47
|
+
entities[key] = characters
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
entities
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def main
|
|
54
|
+
raw_data = fetch_whatwg_entities
|
|
55
|
+
if raw_data.nil?
|
|
56
|
+
warn "Failed to fetch entities. Aborting."
|
|
57
|
+
exit 1
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
puts "Processing #{raw_data.size} raw entries..."
|
|
61
|
+
|
|
62
|
+
entities = convert_entities(raw_data)
|
|
63
|
+
puts "Converted to #{entities.size} standard entities (with semicolon)"
|
|
64
|
+
|
|
65
|
+
result = {
|
|
66
|
+
"meta" => {
|
|
67
|
+
"generated_at" => Time.now.utc.iso8601,
|
|
68
|
+
"source" => WHATWG_ENTITIES_URL,
|
|
69
|
+
"description" => "HTML named character references from WHATWG HTML specification",
|
|
70
|
+
"total_entities" => entities.size
|
|
71
|
+
},
|
|
72
|
+
"entities" => entities.sort.to_h
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
# Write output
|
|
76
|
+
output_path = File.join(__dir__, "..", "lib", "wp2txt", "data", "html_entities.json")
|
|
77
|
+
FileUtils.mkdir_p(File.dirname(output_path))
|
|
78
|
+
|
|
79
|
+
File.write(output_path, JSON.pretty_generate(result))
|
|
80
|
+
puts "\nData written to: #{output_path}"
|
|
81
|
+
|
|
82
|
+
# Summary - show some categories
|
|
83
|
+
greek = entities.keys.select { |k| k.match?(/&(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega);/i) }
|
|
84
|
+
math = entities.keys.select { |k| k.match?(/&(sum|prod|int|infin|nabla|part|forall|exist|empty|isin|notin|cap|cup|sub|sup|oplus|otimes);/i) }
|
|
85
|
+
arrows = entities.keys.select { |k| k.match?(/arr;$/i) }
|
|
86
|
+
|
|
87
|
+
puts "\n=== Summary ==="
|
|
88
|
+
puts "Total entities: #{entities.size}"
|
|
89
|
+
puts "Greek letters: #{greek.size}"
|
|
90
|
+
puts "Math symbols: #{math.size}"
|
|
91
|
+
puts "Arrows: #{arrows.size}"
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
main if __FILE__ == $PROGRAM_NAME
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Fetches Wikipedia language metadata from Wikimedia APIs
|
|
4
|
+
# Usage: ruby scripts/fetch_language_metadata.rb
|
|
5
|
+
#
|
|
6
|
+
# This script queries the Wikimedia sitematrix API to get all Wikipedia
|
|
7
|
+
# language editions, then each edition's siteinfo API for its statistics (article counts, etc.)
|
|
8
|
+
|
|
9
|
+
require "net/http"
|
|
10
|
+
require "json"
|
|
11
|
+
require "fileutils"
|
|
12
|
+
|
|
13
|
+
# Fetch all Wikipedia languages with statistics from sitematrix API
|
|
14
|
+
def fetch_wikipedia_languages
|
|
15
|
+
uri = URI("https://meta.wikimedia.org/w/api.php")
|
|
16
|
+
params = {
|
|
17
|
+
action: "sitematrix",
|
|
18
|
+
smtype: "language",
|
|
19
|
+
format: "json"
|
|
20
|
+
}
|
|
21
|
+
uri.query = URI.encode_www_form(params)
|
|
22
|
+
|
|
23
|
+
response = Net::HTTP.get_response(uri)
|
|
24
|
+
return {} unless response.is_a?(Net::HTTPSuccess)
|
|
25
|
+
|
|
26
|
+
data = JSON.parse(response.body)
|
|
27
|
+
languages = {}
|
|
28
|
+
|
|
29
|
+
data["sitematrix"].each do |key, val|
|
|
30
|
+
next unless key.match?(/^\d+$/) && val.is_a?(Hash) && val["site"]
|
|
31
|
+
|
|
32
|
+
# Find Wikipedia site info
|
|
33
|
+
wiki_site = val["site"].find { |site| site["code"] == "wiki" }
|
|
34
|
+
next unless wiki_site
|
|
35
|
+
|
|
36
|
+
lang_code = val["code"]
|
|
37
|
+
languages[lang_code] = {
|
|
38
|
+
"name" => val["name"],
|
|
39
|
+
"localname" => val["localname"],
|
|
40
|
+
"url" => wiki_site["url"],
|
|
41
|
+
"dbname" => wiki_site["dbname"],
|
|
42
|
+
"closed" => wiki_site["closed"] || false,
|
|
43
|
+
"private" => wiki_site["private"] || false
|
|
44
|
+
}
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
languages
|
|
48
|
+
rescue StandardError => e
|
|
49
|
+
warn "Error fetching sitematrix: #{e.message}"
|
|
50
|
+
{}
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
# Fetch article statistics for a specific Wikipedia
|
|
54
|
+
def fetch_wiki_statistics(lang_code)
|
|
55
|
+
uri = URI("https://#{lang_code}.wikipedia.org/w/api.php")
|
|
56
|
+
params = {
|
|
57
|
+
action: "query",
|
|
58
|
+
meta: "siteinfo",
|
|
59
|
+
siprop: "statistics",
|
|
60
|
+
format: "json"
|
|
61
|
+
}
|
|
62
|
+
uri.query = URI.encode_www_form(params)
|
|
63
|
+
|
|
64
|
+
response = Net::HTTP.get_response(uri)
|
|
65
|
+
return nil unless response.is_a?(Net::HTTPSuccess)
|
|
66
|
+
|
|
67
|
+
data = JSON.parse(response.body)
|
|
68
|
+
stats = data.dig("query", "statistics")
|
|
69
|
+
return nil unless stats
|
|
70
|
+
|
|
71
|
+
{
|
|
72
|
+
"articles" => stats["articles"],
|
|
73
|
+
"pages" => stats["pages"],
|
|
74
|
+
"edits" => stats["edits"],
|
|
75
|
+
"users" => stats["users"],
|
|
76
|
+
"activeusers" => stats["activeusers"]
|
|
77
|
+
}
|
|
78
|
+
rescue StandardError
|
|
79
|
+
nil
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def main
|
|
83
|
+
puts "Fetching Wikipedia language list..."
|
|
84
|
+
languages = fetch_wikipedia_languages
|
|
85
|
+
|
|
86
|
+
if languages.empty?
|
|
87
|
+
warn "Failed to fetch language list. Aborting."
|
|
88
|
+
exit 1
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Filter out closed/private wikis
|
|
92
|
+
active_languages = languages.reject { |_, info| info["closed"] || info["private"] }
|
|
93
|
+
puts "Found #{active_languages.size} active Wikipedia editions."
|
|
94
|
+
|
|
95
|
+
puts "Fetching statistics for each Wikipedia (this may take a few minutes)..."
|
|
96
|
+
successful = 0
|
|
97
|
+
failed = []
|
|
98
|
+
|
|
99
|
+
active_languages.each_with_index do |(lang_code, info), idx|
|
|
100
|
+
print "\r Processing: #{lang_code.ljust(10)} (#{idx + 1}/#{active_languages.size})"
|
|
101
|
+
$stdout.flush
|
|
102
|
+
|
|
103
|
+
stats = fetch_wiki_statistics(lang_code)
|
|
104
|
+
if stats
|
|
105
|
+
info.merge!(stats)
|
|
106
|
+
successful += 1
|
|
107
|
+
else
|
|
108
|
+
failed << lang_code
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
sleep 0.05 # Rate limiting
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
puts "\n Successfully fetched: #{successful}/#{active_languages.size}"
|
|
115
|
+
puts " Failed: #{failed.size} (#{failed.first(10).join(', ')}#{failed.size > 10 ? '...' : ''})" if failed.any?
|
|
116
|
+
|
|
117
|
+
# Categorize by size
|
|
118
|
+
size_categories = {
|
|
119
|
+
"large" => [], # 1M+ articles
|
|
120
|
+
"medium" => [], # 100K-1M articles
|
|
121
|
+
"small" => [], # 10K-100K articles
|
|
122
|
+
"mini" => [] # <10K articles
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
active_languages.each do |lang_code, info|
|
|
126
|
+
articles = info["articles"] || 0
|
|
127
|
+
category = if articles >= 1_000_000
|
|
128
|
+
"large"
|
|
129
|
+
elsif articles >= 100_000
|
|
130
|
+
"medium"
|
|
131
|
+
elsif articles >= 10_000
|
|
132
|
+
"small"
|
|
133
|
+
else
|
|
134
|
+
"mini"
|
|
135
|
+
end
|
|
136
|
+
size_categories[category] << lang_code
|
|
137
|
+
info["size_category"] = category
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
# Build result
|
|
141
|
+
result = {
|
|
142
|
+
"meta" => {
|
|
143
|
+
"generated_at" => Time.now.utc.iso8601,
|
|
144
|
+
"source" => "Wikimedia sitematrix + siteinfo APIs",
|
|
145
|
+
"total_languages" => active_languages.size,
|
|
146
|
+
"statistics_fetched" => successful
|
|
147
|
+
},
|
|
148
|
+
"size_summary" => {
|
|
149
|
+
"large" => size_categories["large"].size,
|
|
150
|
+
"medium" => size_categories["medium"].size,
|
|
151
|
+
"small" => size_categories["small"].size,
|
|
152
|
+
"mini" => size_categories["mini"].size
|
|
153
|
+
},
|
|
154
|
+
"languages" => active_languages.sort_by { |_, info| -(info["articles"] || 0) }.to_h
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
# Write output
|
|
158
|
+
output_path = File.join(__dir__, "..", "lib", "wp2txt", "data", "language_metadata.json")
|
|
159
|
+
FileUtils.mkdir_p(File.dirname(output_path))
|
|
160
|
+
|
|
161
|
+
File.write(output_path, JSON.pretty_generate(result))
|
|
162
|
+
puts "\nData written to: #{output_path}"
|
|
163
|
+
|
|
164
|
+
# Summary
|
|
165
|
+
puts "\n=== Summary ==="
|
|
166
|
+
puts "Total active Wikipedias: #{active_languages.size}"
|
|
167
|
+
puts "Size categories:"
|
|
168
|
+
puts " Large (1M+ articles): #{size_categories['large'].size} - #{size_categories['large'].first(5).join(', ')}..."
|
|
169
|
+
puts " Medium (100K-1M): #{size_categories['medium'].size}"
|
|
170
|
+
puts " Small (10K-100K): #{size_categories['small'].size}"
|
|
171
|
+
puts " Mini (<10K): #{size_categories['mini'].size}"
|
|
172
|
+
|
|
173
|
+
# Top 20 by article count
|
|
174
|
+
puts "\nTop 20 Wikipedias by article count:"
|
|
175
|
+
active_languages.sort_by { |_, info| -(info["articles"] || 0) }.first(20).each_with_index do |(code, info), idx|
|
|
176
|
+
puts " #{(idx + 1).to_s.rjust(2)}. #{code.ljust(5)} #{info['name'].to_s.ljust(20)} #{(info['articles'] || 0).to_s.rjust(10)} articles"
|
|
177
|
+
end
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
main if __FILE__ == $PROGRAM_NAME
|