wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,236 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "etc"
4
+
5
module Wp2txt
  # Memory monitoring and adaptive sizing helpers for streaming operations.
  #
  # Every probe is best-effort: when the platform-specific source (procfs,
  # +ps+, +sysctl+, +tasklist+, +wmic+) is missing, unreadable or returns
  # something unparseable, the method falls back to a documented default
  # instead of raising.
  module MemoryMonitor
    # Available-memory level (in MB) below which {memory_low?} reports true
    LOW_MEMORY_THRESHOLD_MB = 256
    # Not referenced inside this module; kept for external callers
    HIGH_MEMORY_THRESHOLD_MB = 1024
    # Default for the +target_percent+ keyword of {optimal_buffer_size}
    TARGET_MEMORY_USAGE_PERCENT = 70

    # Buffer size bounds
    MIN_BUFFER_SIZE = 1_048_576 # 1 MB minimum
    MAX_BUFFER_SIZE = 104_857_600 # 100 MB maximum
    DEFAULT_BUFFER_SIZE = 10_485_760 # 10 MB default

    # Memory required per parallel process (estimated)
    MEMORY_PER_PROCESS_MB = 300

    # Unit sizes in bytes
    BYTES_PER_KB = 1024
    BYTES_PER_MB = 1_048_576
    BYTES_PER_GB = 1_073_741_824

    # Total memory assumed when the real figure cannot be determined (4 GB)
    FALLBACK_TOTAL_MEMORY = 4 * BYTES_PER_GB

    module_function

    # Get current process memory usage (resident set size) in bytes
    # @return [Integer] Memory usage in bytes, or 0 if unavailable
    def current_memory_usage
      if Gem.win_platform?
        # Windows: use tasklist (less reliable)
        # CSV format: "process.exe","PID","Session","Session#","Mem Usage"
        command = ["tasklist", "/FI", "PID eq #{Process.pid}", "/FO", "CSV", "/NH"]
        output = IO.popen(command, err: File::NULL, &:read)
        kilobytes = output[/(\d[\d,]*)\s*K/, 1]
        return kilobytes.delete(",").to_i * BYTES_PER_KB if kilobytes
      elsif File.exist?("/proc/#{Process.pid}/status")
        # Linux: read the resident set size from procfs
        kilobytes = File.read("/proc/#{Process.pid}/status")[/^VmRSS:\s*(\d+)\s*kB/, 1]
        return kilobytes.to_i * BYTES_PER_KB if kilobytes
      else
        # macOS/BSD: ps reports RSS in kilobytes
        output = IO.popen(["ps", "-o", "rss=", "-p", Process.pid.to_s], err: File::NULL, &:read).strip
        return output.to_i * BYTES_PER_KB unless output.empty?
      end
      0
    rescue StandardError
      # Missing tool, unreadable procfs entry, undecodable output, ...
      0
    end

    # Get total system memory in bytes
    # @return [Integer] Total memory in bytes, or a 4 GB default if unavailable
    def total_system_memory
      bytes =
        if Gem.win_platform?
          # Windows: use wmic; the first run of digits is the byte count
          output = IO.popen(["wmic", "computersystem", "get", "TotalPhysicalMemory"], err: File::NULL, &:read)
          output[/\d+/].to_i
        elsif File.exist?("/proc/meminfo")
          # Linux
          File.read("/proc/meminfo")[/^MemTotal:\s*(\d+)\s*kB/, 1].to_i * BYTES_PER_KB
        else
          # macOS: use sysctl
          IO.popen(["sysctl", "-n", "hw.memsize"], err: File::NULL, &:read).strip.to_i
        end

      # A zero total means the probe produced nothing usable; returning it
      # would make every derived figure meaningless.
      bytes.positive? ? bytes : FALLBACK_TOTAL_MEMORY
    rescue StandardError
      FALLBACK_TOTAL_MEMORY
    end

    # Get available (free) memory in bytes
    #
    # On Linux this is +MemAvailable+ from /proc/meminfo, or
    # MemFree + Buffers + Cached when that field is absent. Elsewhere it is
    # estimated as total system memory minus this process's own usage.
    # @return [Integer] Available memory in bytes (never negative)
    def available_memory
      meminfo =
        begin
          File.read("/proc/meminfo") if File.exist?("/proc/meminfo")
        rescue StandardError
          nil
        end

      unless meminfo
        # macOS/other: estimate as total - current usage. Clamp at zero
        # because the total may be the fallback default while the process
        # itself is larger than that.
        return [total_system_memory - current_memory_usage, 0].max
      end

      available_kb = meminfo[/^MemAvailable:\s*(\d+)\s*kB/, 1]
      return available_kb.to_i * BYTES_PER_KB if available_kb

      # No MemAvailable field: approximate it from the individual counters.
      # The ^ anchor keeps "Cached" from matching "SwapCached".
      %w[MemFree Buffers Cached].sum do |field|
        meminfo[/^#{field}:\s*(\d+)\s*kB/, 1].to_i * BYTES_PER_KB
      end
    end

    # Calculate this process's memory usage as a share of total system memory
    # @return [Float] Percentage of memory used (0-100)
    def memory_usage_percent
      total = total_system_memory
      return 0.0 if total.zero?

      (current_memory_usage.to_f / total * 100).round(2)
    end

    # Determine if memory is running low
    # @return [Boolean] true if available memory is below LOW_MEMORY_THRESHOLD_MB
    def memory_low?
      available_memory / BYTES_PER_MB < LOW_MEMORY_THRESHOLD_MB
    end

    # Calculate optimal buffer size based on available memory
    #
    # The buffer is 10% of available memory, clamped to
    # MIN_BUFFER_SIZE..MAX_BUFFER_SIZE and rounded to a whole number of MB.
    # @param target_percent [Integer] Accepted for interface compatibility;
    #   NOTE: it does not currently influence the result
    # @return [Integer] Recommended buffer size in bytes
    def optimal_buffer_size(target_percent: TARGET_MEMORY_USAGE_PERCENT)
      # Conservative: use only 10% of available memory for buffer
      budget = (available_memory * 0.10).to_i.clamp(MIN_BUFFER_SIZE, MAX_BUFFER_SIZE)

      # Round to nearest MB for cleaner allocation (bounds are whole MB, so
      # rounding cannot leave the clamped range)
      (budget / BYTES_PER_MB.to_f).round * BYTES_PER_MB
    end

    # Get a summary of current memory status
    # @return [Hash] Memory statistics
    def memory_stats
      {
        current_usage_mb: (current_memory_usage / BYTES_PER_MB.to_f).round(2),
        total_system_mb: (total_system_memory / BYTES_PER_MB.to_f).round(2),
        available_mb: (available_memory / BYTES_PER_MB.to_f).round(2),
        usage_percent: memory_usage_percent,
        recommended_buffer_mb: (optimal_buffer_size / BYTES_PER_MB.to_f).round(2),
        low_memory: memory_low?
      }
    end

    # Format memory size for display
    # @param bytes [Integer] Size in bytes
    # @return [String] Human-readable size
    def format_memory(bytes)
      if bytes < BYTES_PER_KB
        "#{bytes} B"
      elsif bytes < BYTES_PER_MB
        "#{(bytes / BYTES_PER_KB.to_f).round(1)} KB"
      elsif bytes < BYTES_PER_GB
        "#{(bytes / BYTES_PER_MB.to_f).round(1)} MB"
      else
        "#{(bytes / BYTES_PER_GB.to_f).round(2)} GB"
      end
    end

    # Run garbage collection if memory is low
    # @return [Boolean] true if GC was triggered
    def gc_if_needed
      return false unless memory_low?

      GC.start
      true
    end

    # Calculate optimal number of parallel processes based on CPU and memory
    # @param memory_per_process_mb [Numeric] Estimated memory per process in MB;
    #   a zero or negative value disables the memory-based limit
    # @return [Integer] Recommended number of parallel processes (at least 1)
    def optimal_processes(memory_per_process_mb: MEMORY_PER_PROCESS_MB)
      cores = Etc.nprocessors

      # CPU-based calculation (scale based on core count)
      cpu_based =
        case cores
        when 1..4 then [cores - 1, 1].max
        when 5..8 then cores - 2
        else (cores * 0.75).to_i # Large systems: use 75% of cores
        end

      # Dividing by a non-positive estimate would raise (0) or yield a
      # meaningless limit (negative), so only the CPU limit applies then.
      return [cpu_based, 1].max unless memory_per_process_mb.positive?

      # Memory-based limit
      available_mb = available_memory / BYTES_PER_MB
      memory_based = (available_mb / memory_per_process_mb).to_i

      # Use the smaller of CPU and memory limits, minimum 1
      [[cpu_based, memory_based].min, 1].max
    end

    # Get system info for parallel processing decisions
    # @return [Hash] System information
    def parallel_processing_info
      cores = Etc.nprocessors
      available_mb = (available_memory / BYTES_PER_MB.to_f).round(0)
      optimal = optimal_processes

      {
        cpu_cores: cores,
        available_memory_mb: available_mb,
        memory_per_process_mb: MEMORY_PER_PROCESS_MB,
        optimal_processes: optimal,
        max_by_cpu: cores,
        max_by_memory: (available_mb / MEMORY_PER_PROCESS_MB).to_i
      }
    end
  end
end