wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
data/lib/wp2txt/cli.rb ADDED
@@ -0,0 +1,319 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "optimist"
4
+ require_relative "version"
5
+ require_relative "multistream"
6
+ require_relative "config"
7
+ require_relative "memory_monitor"
8
+
9
module Wp2txt
  # Command-line option parsing and validation for the wp2txt executable.
  module CLI
    # Shape of an accepted --lang value: a 2-10 letter lowercase primary code,
    # optionally followed by up to two hyphenated 2-10 letter segments
    # (en, ja, simple, zh-yue, zh-min-nan).
    LANGUAGE_CODE_PATTERN = /\A[a-z]{2,10}(?:-[a-z]{2,10}){0,2}\z/

    class << self
      # Number of parallel processes to use on this machine.
      # Delegates entirely to MemoryMonitor.optimal_processes.
      # @return [Integer] value reported by MemoryMonitor
      def max_processors
        MemoryMonitor.optimal_processes
      end

      # Load configuration and remember it for later calls to {#config}.
      # @param config_path [String, nil] explicit path; Config.default_path is used when nil
      # @return [Config] whatever Config.load returns for the chosen path
      def load_config(config_path = nil)
        path = config_path || Config.default_path
        @config = Config.load(path)
      end

      # Current configuration, loaded from the default path on first use.
      # @return [Config] memoized configuration object
      def config
        @config ||= load_config
      end

      # Parse command line options and validate them.
      # @param args [Array<String>] Command line arguments
      # @return [Hash] Parsed options (as returned by Optimist.options)
      def parse_options(args)
        # Read the config into a local before entering the Optimist block so
        # the block can use it for option defaults.
        # NOTE(review): the config is read before --config-path is parsed, so
        # that flag cannot influence the defaults declared below — confirm the
        # caller reloads the config when --config-path is given.
        cfg = config

        opts = Optimist.options(args) do
          version Wp2txt::VERSION
          banner <<~BANNER
            WP2TXT extracts plain text data from Wikipedia dump files.

            Usage:
            wp2txt --lang=ja [options] # Auto-download and process
            wp2txt --input=FILE [options] # Process local file
            wp2txt --lang=en --from-category="Cities" -o ./output # Extract category
            wp2txt --lang=en --from-category="Cities" --dry-run # Preview only
            wp2txt --cache-status # Show cache status
            wp2txt --cache-clear [--lang=CODE] # Clear cache
            wp2txt --config-init # Create default config file

            Options:
          BANNER

          # Input source (one of --input or --lang required, unless cache operations)
          opt :input, "Path to compressed file (bz2) or XML file",
              type: String, short: "-i"
          opt :lang, "Wikipedia language code (e.g., ja, en, de) for auto-download",
              type: String, short: "-L"
          opt :articles, "Specific article titles to extract (comma-separated, requires --lang)",
              type: String, short: "-A"
          opt :from_category, "Extract articles from Wikipedia category (requires --lang)",
              type: String, short: "-G"
          opt :depth, "Subcategory recursion depth for --from-category (0 = no recursion)",
              default: cfg.default_depth, type: Integer, short: "-D"
          opt :yes, "Skip confirmation prompt for category extraction",
              default: false, short: "-y"
          opt :dry_run, "Preview category extraction without downloading",
              default: false
          opt :update_cache, "Force refresh of cached dump files",
              default: false, short: "-U"

          # Output options
          opt :output_dir, "Path to output directory",
              default: Dir.pwd, type: String, short: "-o"
          opt :format, "Output format: text or json (JSONL)",
              default: cfg.default_format, short: "-j"

          # Cache management
          opt :cache_dir, "Cache directory for downloaded dumps",
              default: cfg.cache_directory, type: String
          opt :cache_status, "Show cache status and exit",
              default: false
          opt :cache_clear, "Clear cache and exit",
              default: false
          opt :config_init, "Create default configuration file (~/.wp2txt/config.yml)",
              default: false
          opt :config_path, "Path to configuration file",
              type: String

          # Processing options
          opt :category, "Show article category information",
              default: true, short: "-a"
          opt :category_only, "Extract only article title and categories",
              default: false, short: "-g"
          opt :summary_only, "Extract only title, categories, and summary",
              default: false, short: "-s"
          opt :metadata_only, "Extract only title, section headings, and categories (for analysis)",
              default: false, short: "-M"

          # Section extraction options
          opt :sections, "Extract specific sections (comma-separated, 'summary' for lead text)",
              type: String, short: "-S"
          opt :section_output, "Section output mode: 'structured' (default) or 'combined'",
              default: "structured"
          opt :min_section_length, "Minimum section length in characters (shorter sections become null)",
              default: 0, type: Integer
          opt :skip_empty, "Skip articles with no matching sections",
              default: false
          opt :alias_file, "Custom section alias definitions file (YAML format)",
              type: String
          opt :no_section_aliases, "Disable section alias matching (exact match only)",
              default: false
          opt :section_stats, "Collect and output section heading statistics (JSON)",
              default: false
          opt :show_matched_sections, "Include matched_sections field in JSON output (shows actual headings)",
              default: false

          opt :file_size, "Approximate size (in MB) of each output file (0 for single file)",
              default: 10, short: "-f"
          opt :num_procs, "Number of parallel processes (auto-detected based on CPU/memory)",
              type: Integer, short: "-n"
          opt :title, "Keep page titles in output",
              default: true, short: "-t"
          opt :heading, "Keep section titles in output",
              default: true, short: "-d"
          opt :list, "Keep unprocessed list items in output",
              default: false, short: "-l"
          opt :table, "Keep wiki table content in output",
              default: false
          opt :pre, "Keep preformatted text blocks in output",
              default: false, short: "-p"
          opt :ref, "Keep reference notations [ref]...[/ref]",
              default: false, short: "-r"
          opt :multiline, "Keep multi-line templates in output",
              default: false
          opt :redirect, "Show redirect destination",
              default: false, short: "-e"
          opt :marker, "Show symbols prefixed to list items",
              default: true, short: "-m"
          opt :markers, "Content type markers (math,code,chem,table,score,timeline,graph,ipa or 'all')",
              default: "all", short: "-k"
          opt :extract_citations, "Extract formatted citations instead of removing them",
              default: false, short: "-C"
          opt :expand_templates, "Expand common templates (birth date, convert, etc.) to readable text",
              default: true, short: "-E"
          opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of system command",
              default: false, short: "-b"
          opt :ractor, "Use Ractor for parallel processing (Ruby 4.0+, streaming mode only)",
              default: false, short: "-R"
          opt :no_turbo, "Disable turbo mode (use streaming instead, saves disk space)",
              default: false

          # Output control
          opt :quiet, "Suppress progress output (only show errors and final result)",
              default: false, short: "-q"
          opt :no_color, "Disable colored output (also respects NO_COLOR env variable)",
              default: false

          # Deprecated options
          opt :convert, "[DEPRECATED] No longer needed",
              default: true, short: "-c"
          opt :del_interfile, "[DEPRECATED] Intermediate files no longer created",
              default: false, short: "-x"
        end

        validate_options!(opts)
        opts
      end

      # Validate parsed options, reporting the first problem via Optimist.die.
      #
      # The checks run in a fixed order (input source, category flags, paths
      # and format, alias file, extraction modes) so the first reported error
      # is stable. Cache and config operations skip validation entirely.
      # @param opts [Hash] parsed options keyed by Symbol
      # @return [nil]
      def validate_options!(opts)
        # Cache and config operations don't need input/lang
        return if opts[:cache_status] || opts[:cache_clear] || opts[:config_init]

        validate_source_options!(opts)
        validate_category_options!(opts)
        validate_paths_and_format!(opts)
        validate_alias_file!(opts[:alias_file])
        validate_extraction_modes!(opts)
      end

      # Split a comma-separated --articles value into individual titles.
      # Surrounding whitespace is stripped and blank entries are dropped.
      # @param articles_str [String, nil] raw option value
      # @return [Array<String>] titles in the order given; [] for nil or ""
      def parse_article_list(articles_str)
        return [] if articles_str.nil? || articles_str.empty?

        articles_str.split(",").map(&:strip).reject(&:empty?)
      end

      # Check whether a language code has an acceptable shape.
      #
      # Accepts a 2-10 letter lowercase primary code optionally followed by up
      # to two hyphenated 2-10 letter segments, so multi-part Wikipedia codes
      # such as zh-min-nan are accepted alongside en, ja, simple and zh-yue.
      # Only the format is checked, not whether such a wiki exists.
      # @param code [String, nil] candidate language code
      # @return [Boolean] true when the code matches the expected format
      def valid_language_code?(code)
        return false if code.nil? || code.empty?

        code.match?(LANGUAGE_CODE_PATTERN)
      end

      # Default cache directory as declared by Config.
      # @return [String] Config::DEFAULT_CACHE_DIR
      def default_cache_dir
        Config::DEFAULT_CACHE_DIR
      end

      private

      # Enforce the --input / --lang / --articles / --from-category combinations.
      def validate_source_options!(opts)
        input = opts[:input]
        lang = opts[:lang]

        Optimist.die "Either --input or --lang is required" if input.nil? && lang.nil?
        Optimist.die "Cannot specify both --input and --lang" if input && lang

        if opts[:articles]
          Optimist.die "--articles requires --lang" if lang.nil?
          Optimist.die "--articles cannot be used with --input" if input
        end

        return unless opts[:from_category]

        Optimist.die "--from-category requires --lang" if lang.nil?
        Optimist.die "--from-category cannot be used with --input" if input
        Optimist.die "--from-category cannot be used with --articles" if opts[:articles]
      end

      # Check --depth and the flags that only make sense with --from-category.
      def validate_category_options!(opts)
        depth = opts[:depth]
        Optimist.die :depth, "must be 0 or greater" if depth.negative?
        # Deep recursion is allowed but can pull in a very large article set.
        warn "Warning: --depth > 3 may result in a very large number of articles" if depth > 3

        without_category = opts[:from_category].nil?
        Optimist.die "--dry-run requires --from-category" if opts[:dry_run] && without_category
        Optimist.die "--yes requires --from-category" if opts[:yes] && without_category
      end

      # Check the input path, language code, output directory, format and file size.
      def validate_paths_and_format!(opts)
        if opts[:input] && !File.exist?(opts[:input])
          Optimist.die :input, "file does not exist"
        end

        if opts[:lang] && !valid_language_code?(opts[:lang])
          Optimist.die :lang, "invalid language code format"
        end

        # Must be an actual directory: a regular file at this path would pass
        # a plain existence test.
        unless File.directory?(opts[:output_dir])
          Optimist.die :output_dir, "directory does not exist"
        end

        unless %w[text json].include?(opts[:format].to_s.downcase)
          Optimist.die :format, "must be 'text' or 'json'"
        end

        Optimist.die :file_size, "must be 0 or larger" if opts[:file_size].negative?
      end

      # Check that --alias-file names a readable YAML file that safe_load accepts.
      def validate_alias_file!(path)
        return unless path

        # File.file? rather than File.exist?: a directory cannot be read as YAML.
        Optimist.die :alias_file, "file does not exist" unless File.file?(path)

        begin
          require "yaml" # loaded lazily; only needed when --alias-file is given
          YAML.safe_load(File.read(path), permitted_classes: [Symbol])
        rescue Psych::SyntaxError => e
          Optimist.die :alias_file, "invalid YAML syntax: #{e.message}"
        rescue Psych::Exception => e
          # safe_load also rejects aliases and non-permitted classes; report
          # those as a normal option error instead of an uncaught exception.
          Optimist.die :alias_file, "unsupported YAML content: #{e.message}"
        end
      end

      # Enforce mutual exclusion between extraction modes and section options.
      def validate_extraction_modes!(opts)
        mode_flags = {
          category_only: "--category-only",
          summary_only: "--summary-only",
          metadata_only: "--metadata-only"
        }
        active = mode_flags.select { |key, _flag| opts[key] }.values

        if active.size > 1
          Optimist.die "#{active.join(', ')} cannot be combined (choose one extraction mode)"
        end

        if opts[:sections] && active.any?
          Optimist.die "--sections cannot be used with #{active.first}"
        end

        # --section-stats is a standalone mode
        if opts[:section_stats]
          Optimist.die "--section-stats cannot be used with --sections" if opts[:sections]
          Optimist.die "--section-stats cannot be used with #{active.first}" if active.any?
        end

        # matched_sections is only emitted in JSON output
        if opts[:show_matched_sections] && opts[:format].to_s.downcase != "json"
          Optimist.die "--show-matched-sections requires --format json"
        end
      end
    end
  end
end