wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "etc"
|
|
4
|
+
|
|
5
|
+
module Wp2txt
  # Memory monitoring and adaptive buffer sizing for streaming operations.
  #
  # Provides utilities to track memory usage and dynamically adjust buffer
  # sizes. Every public method is a module function, e.g.
  # +Wp2txt::MemoryMonitor.memory_stats+. The operating-system probes are
  # best-effort: when a probe fails, the documented fallback value is returned.
  module MemoryMonitor
    # Default memory thresholds
    LOW_MEMORY_THRESHOLD_MB = 256
    HIGH_MEMORY_THRESHOLD_MB = 1024
    TARGET_MEMORY_USAGE_PERCENT = 70

    # Buffer size bounds
    MIN_BUFFER_SIZE = 1_048_576 # 1 MB minimum
    MAX_BUFFER_SIZE = 104_857_600 # 100 MB maximum
    DEFAULT_BUFFER_SIZE = 10_485_760 # 10 MB default

    # Memory required per parallel process (estimated)
    MEMORY_PER_PROCESS_MB = 300

    # Internal helpers (Integers only, so they stay shareable everywhere).
    BYTES_PER_MB = 1_048_576
    FALLBACK_TOTAL_MEMORY = 4 * 1024 * 1024 * 1024 # Default 4 GB
    private_constant :BYTES_PER_MB, :FALLBACK_TOTAL_MEMORY

    module_function

    # Get current process memory usage (resident set size) in bytes.
    #
    # Windows uses +tasklist+, Linux reads +/proc/<pid>/status+, and any other
    # platform asks +ps+.
    #
    # @return [Integer] Memory usage in bytes, or 0 if unavailable
    def current_memory_usage
      if Gem.win_platform?
        # Windows: use tasklist (less reliable)
        report = capture_command("tasklist", "/FI", "PID eq #{Process.pid}", "/FO", "CSV", "/NH")
        parse_tasklist_memory(report) || 0
      elsif File.exist?("/proc/#{Process.pid}/status")
        # Linux: VmRSS is reported in kB
        proc_field_kb(read_proc_file("/proc/#{Process.pid}/status"), "VmRSS").to_i * 1024
      else
        # macOS/BSD: `ps -o rss=` prints the RSS in kB without a header
        capture_command("ps", "-o", "rss=", "-p", Process.pid.to_s).to_s.to_i * 1024
      end
    end

    # Get total system memory in bytes.
    #
    # @return [Integer] Total memory in bytes; 4 GB when the platform probe
    #   fails or reports a non-positive value (so the result is never zero)
    def total_system_memory
      detected =
        if Gem.win_platform?
          # Windows: use wmic
          capture_command("wmic", "computersystem", "get", "TotalPhysicalMemory").to_s.b[/\d+/].to_i
        elsif File.exist?("/proc/meminfo")
          # Linux
          proc_field_kb(read_proc_file("/proc/meminfo"), "MemTotal").to_i * 1024
        else
          # macOS: use sysctl
          capture_command("sysctl", "-n", "hw.memsize").to_s.to_i
        end

      detected.positive? ? detected : FALLBACK_TOTAL_MEMORY
    end

    # Get available (free) memory in bytes.
    #
    # On Linux this is +MemAvailable+ from +/proc/meminfo+, or the sum of
    # +MemFree+, +Buffers+ and +Cached+ when that field is absent. Elsewhere
    # (or when +/proc/meminfo+ cannot be read) it is a rough estimate:
    # total system memory minus this process's own usage.
    #
    # @return [Integer] Available memory in bytes (never negative)
    def available_memory
      meminfo = File.exist?("/proc/meminfo") ? read_proc_file("/proc/meminfo") : nil

      if meminfo
        available_kb = proc_field_kb(meminfo, "MemAvailable")
        return available_kb * 1024 if available_kb

        # Older kernels: estimate from MemFree + Buffers + Cached
        %w[MemFree Buffers Cached].sum { |field| proc_field_kb(meminfo, field).to_i } * 1024
      else
        # The fallback total can be smaller than the real usage, so clamp at 0.
        [total_system_memory - current_memory_usage, 0].max
      end
    end

    # Calculate this process's memory usage as a share of total system memory.
    #
    # @return [Float] Percentage of memory used (0-100), rounded to 2 decimals
    def memory_usage_percent
      usage_percent_of(current_memory_usage, total_system_memory)
    end

    # Determine if memory is running low.
    #
    # @return [Boolean] true if available memory is below
    #   LOW_MEMORY_THRESHOLD_MB
    def memory_low?
      low_memory_for?(available_memory)
    end

    # Calculate optimal buffer size based on available memory.
    #
    # The buffer is 10% of the available memory, clamped to
    # MIN_BUFFER_SIZE..MAX_BUFFER_SIZE and rounded to a whole number of MB.
    #
    # @param target_percent [Integer] Accepted for interface compatibility;
    #   the current calculation does not use it
    # @return [Integer] Recommended buffer size in bytes
    def optimal_buffer_size(target_percent: TARGET_MEMORY_USAGE_PERCENT)
      buffer_size_for(available_memory)
    end

    # Get a summary of current memory status.
    #
    # Each underlying probe is sampled exactly once so that all reported
    # figures describe the same snapshot.
    #
    # @return [Hash] Memory statistics
    def memory_stats
      current = current_memory_usage
      total = total_system_memory
      available = available_memory

      {
        current_usage_mb: (current / BYTES_PER_MB.to_f).round(2),
        total_system_mb: (total / BYTES_PER_MB.to_f).round(2),
        available_mb: (available / BYTES_PER_MB.to_f).round(2),
        usage_percent: usage_percent_of(current, total),
        recommended_buffer_mb: (buffer_size_for(available) / BYTES_PER_MB.to_f).round(2),
        low_memory: low_memory_for?(available)
      }
    end

    # Format memory size for display.
    #
    # @param bytes [Integer] Size in bytes
    # @return [String] Human-readable size
    def format_memory(bytes)
      if bytes < 1024
        "#{bytes} B"
      elsif bytes < 1_048_576
        "#{(bytes / 1024.0).round(1)} KB"
      elsif bytes < 1_073_741_824
        "#{(bytes / 1_048_576.0).round(1)} MB"
      else
        "#{(bytes / 1_073_741_824.0).round(2)} GB"
      end
    end

    # Run garbage collection if memory is low.
    #
    # @return [Boolean] true if GC was triggered
    def gc_if_needed
      return false unless memory_low?

      GC.start
      true
    end

    # Calculate optimal number of parallel processes based on CPU and memory.
    #
    # @param memory_per_process_mb [Integer] Estimated memory per process in MB
    # @return [Integer] Recommended number of parallel processes (at least 1)
    def optimal_processes(memory_per_process_mb: MEMORY_PER_PROCESS_MB)
      processes_for(Etc.nprocessors, available_memory, memory_per_process_mb)
    end

    # Get system info for parallel processing decisions.
    #
    # Available memory is sampled once, so +optimal_processes+ and
    # +max_by_memory+ are derived from the same value and cannot disagree.
    #
    # @return [Hash] System information
    def parallel_processing_info
      cores = Etc.nprocessors
      available = available_memory

      {
        cpu_cores: cores,
        available_memory_mb: (available / BYTES_PER_MB.to_f).round(0),
        memory_per_process_mb: MEMORY_PER_PROCESS_MB,
        optimal_processes: processes_for(cores, available, MEMORY_PER_PROCESS_MB),
        max_by_cpu: cores,
        max_by_memory: memory_process_limit(available, MEMORY_PER_PROCESS_MB)
      }
    end

    # --- internal helpers -------------------------------------------------

    # Run an external command (stderr discarded) and return its stdout,
    # or nil when it cannot be run.
    def capture_command(*argv)
      IO.popen(argv, err: File::NULL, &:read)
    rescue StandardError
      nil
    end

    # Read a /proc pseudo-file, returning nil instead of raising when the
    # read fails (e.g. it vanished or is not permitted).
    def read_proc_file(path)
      File.read(path)
    rescue SystemCallError, IOError
      nil
    end

    # Extract the integer kB value of a "Name:   123 kB" line, or nil when
    # the text is nil or has no such line.
    def proc_field_kb(text, name)
      text && text[/^#{Regexp.escape(name)}:\s*(\d+)\s*kB/, 1]&.to_i
    end

    # Convert `tasklist /FO CSV /NH` output to bytes, or nil if unparsable.
    # The memory figure is the last quoted field ("12,345 K"); keeping only
    # its digits makes the result independent of the locale's thousands
    # separator. The binary copy avoids encoding errors on code-page output.
    def parse_tasklist_memory(output)
      field = output.to_s.b.strip[/"([^"]*)"\z/, 1]
      digits = field.to_s.delete("^0-9")
      digits.empty? ? nil : digits.to_i * 1024
    end

    # Percentage of +total+ represented by +used+ (0.0 when total is zero).
    def usage_percent_of(used, total)
      return 0.0 if total.zero?

      (used.to_f / total * 100).round(2)
    end

    # Whether +available_bytes+ (floored to whole MB) is under the low-memory
    # threshold.
    def low_memory_for?(available_bytes)
      available_bytes / BYTES_PER_MB < LOW_MEMORY_THRESHOLD_MB
    end

    # Buffer size for a given amount of available memory: 10% of it, clamped
    # to the configured bounds and rounded to the nearest whole MB.
    def buffer_size_for(available_bytes)
      target = (available_bytes * 0.10).to_i.clamp(MIN_BUFFER_SIZE, MAX_BUFFER_SIZE)
      (target / BYTES_PER_MB.to_f).round * BYTES_PER_MB
    end

    # How many processes of +memory_per_process_mb+ fit in +available_bytes+
    # (available memory is floored to whole MB first).
    def memory_process_limit(available_bytes, memory_per_process_mb)
      (available_bytes / BYTES_PER_MB / memory_per_process_mb).to_i
    end

    # Smaller of the CPU-based and memory-based process limits, minimum 1.
    def processes_for(cores, available_bytes, memory_per_process_mb)
      # CPU-based calculation: leave headroom that grows with the core count
      cpu_based = case cores
                  when 1..4
                    [cores - 1, 1].max
                  when 5..8
                    cores - 2
                  else
                    # Large systems: use 75% of cores
                    (cores * 0.75).to_i
                  end

      memory_based = memory_process_limit(available_bytes, memory_per_process_mb)
      [[cpu_based, memory_based].min, 1].max
    end

    private_class_method :capture_command, :read_proc_file, :proc_field_kb,
                         :parse_tasklist_memory, :usage_percent_of, :low_memory_for?,
                         :buffer_size_for, :memory_process_limit, :processes_for
  end
end
|