wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,545 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "multistream"
4
+ require_relative "cli"
5
+
6
+ module Wp2txt
7
+ # Article extraction utilities for WpApp
8
+ module Extractor
9
+ # Exit codes returned by extract_specific_articles and extract_category_articles
10
+ EXIT_SUCCESS = 0
11
+ EXIT_ERROR = 1
12
+ EXIT_PARTIAL = 2
13
+
14
# Extract an explicit list of articles from a multistream dump.
#
# Runs four steps: download the index, locate every requested title in it,
# download the streams that hold the located titles, then format and write
# each article through an OutputWriter.
#
# @param opts [Hash] CLI options; this method reads :lang, :cache_dir,
#   :articles, :update_cache, :output_dir, :format and :file_size, and passes
#   the whole hash on to build_extraction_config
# @return [Integer] EXIT_ERROR when none of the titles is in the index,
#   EXIT_PARTIAL when a title was missing or failed to extract,
#   EXIT_SUCCESS otherwise
def extract_specific_articles(opts)
  lang = opts[:lang]
  cache_dir = opts[:cache_dir]
  article_titles = Wp2txt::CLI.parse_article_list(opts[:articles])
  app_config = Wp2txt::CLI.config
  force_update = opts[:update_cache]
  total_steps = 4
  started_at = Time.now

  # Mode banner: list at most three titles, then fall back to a total count
  articles_display =
    if article_titles.size > 3
      "#{article_titles.first(3).join(', ')}... (#{article_titles.size} total)"
    else
      article_titles.join(", ")
    end
  banner_fields = {
    "Language" => lang,
    "Articles" => articles_display,
    "Output" => opts[:output_dir]
  }
  print_mode_banner("Article Extraction", banner_fields)

  manager = Wp2txt::DumpManager.new(
    lang,
    cache_dir: cache_dir,
    dump_expiry_days: app_config.dump_expiry_days
  )

  # Step 1: Download index
  print_header("Downloading index", step: 1, total_steps: total_steps)
  index_path = manager.download_index(force: force_update)

  # Step 2: Load index and find articles.
  # The target titles are handed to the index so that parsing can stop early
  # once all of them are found (when the index is not served from cache).
  print_header("Locating articles", step: 2, total_steps: total_steps)
  index = Wp2txt::MultistreamIndex.new(
    index_path,
    cache_dir: cache_dir,
    target_titles: article_titles
  )

  index_note =
    if index.loaded_from_cache?
      " Index loaded from cache (#{index.size} entries)"
    elsif index.early_terminated?
      " Early termination: found all #{article_titles.size} articles"
    end
  puts index_note if index_note
  puts

  # Split the requested titles into index hits and misses, reporting each one
  found_articles = []
  not_found = []
  article_titles.each do |title|
    entry = index.find_by_title(title)
    unless entry
      not_found << title
      print_list_item("#{title} (not found)", status: :error)
      next
    end

    found_articles << entry
    print_list_item("#{title}", status: :success)
  end

  if found_articles.empty?
    puts unless quiet?
    print_error("No articles found. Please check the titles.")
    return EXIT_ERROR
  end

  # Step 3: Download streams (one stream per distinct byte offset)
  streams_needed = found_articles.map { |hit| hit[:offset] }.uniq.sort
  print_header("Downloading data (#{streams_needed.size} streams)", step: 3, total_steps: total_steps)
  multistream_path = download_partial_streams(manager, index, streams_needed, force: force_update)

  # Reuse the already-loaded index so it is not parsed a second time
  reader = Wp2txt::MultistreamReader.new(multistream_path, index)

  output_format = opts[:format].to_s.downcase.to_sym
  config = build_extraction_config(opts, output_format)

  writer = OutputWriter.new(
    output_dir: opts[:output_dir],
    base_name: "#{lang}wiki_articles",
    format: output_format,
    file_size_mb: opts[:file_size]
  )

  # Step 4: Extract articles
  print_header("Extracting articles", step: 4, total_steps: total_steps)
  extracted_count = 0
  extraction_failures = []

  found_articles.each do |entry|
    title = entry[:title]
    page = reader.extract_article(title)
    unless page
      extraction_failures << title
      print_list_item("#{title} (extraction failed)", status: :warning)
      next
    end

    article = Article.new(page[:text], page[:title], !config[:marker])
    writer.write(format_article(article, config))
    extracted_count += 1
    print_list_item("#{title}", status: :success)
  end

  # Closing the writer yields the list of files it produced
  output_files = writer.close
  total_time = Time.now - started_at

  # Summary
  has_issues = not_found.any? || extraction_failures.any?
  summary_fields = {
    "Extracted" => "#{extracted_count}/#{article_titles.size}",
    "Output files" => output_files.size.to_s,
    "Total time" => format_duration(total_time)
  }
  print_summary("Extraction Complete", summary_fields, status: has_issues ? :warning : :success)

  if not_found.any?
    puts unless quiet?
    print_warning("Not found in index (#{not_found.size}):")
    not_found.each { |missing_title| print_list_item(missing_title, status: :error) }
  end

  unless quiet?
    puts
    puts pastel.dim("Output files:")
  end
  output_files.each { |path| print_list_item(path, status: :success) }

  has_issues ? EXIT_PARTIAL : EXIT_SUCCESS
end
151
+
152
# Download only as much of the multistream dump as the requested articles need.
#
# A partial download (the first k streams) is requested when the highest
# needed offset is a known stream that is not the last one; in every other
# case the full dump is downloaded.
#
# @param manager [DumpManager] The dump manager
# @param index [MultistreamIndex] The multistream index
# @param stream_offsets [Array<Integer>] Byte offsets of streams to download
# @param force [Boolean] Force re-download even if cached
# @return whatever manager.download_multistream returns
def download_partial_streams(manager, index, stream_offsets, force: false)
  known_offsets = index.stream_offsets
  position = known_offsets.index(stream_offsets.max)
  last_position = known_offsets.size - 1

  if position && position < last_position
    # Streams are counted from 1, so position k means k + 1 streams
    # (DumpManager handles the caching logic)
    manager.download_multistream(max_streams: position + 1, force: force)
  else
    # Unknown offset or the final stream: the full file is required
    manager.download_multistream(force: force)
  end
end
173
+
174
# Show download estimate before confirmation.
#
# Prints one status line for the cached index and one for the cached dump
# (full or partial), plus a warning when the index cache is older than the
# expiry threshold.
#
# @param manager [DumpManager] The dump manager
# @param app_config [Config, nil] Application configuration; when it is nil
#   (or has no expiry value) the manager's own dump_expiry_days is used
# @return [Boolean] true if the index cache is stale and the user should be
#   warned; always a strict true/false, including when the cache age is unknown
def show_download_estimate(manager, app_config = nil)
  puts pastel.dim("Download status:")

  # Check index cache
  index_path = manager.cached_index_path
  cache_is_stale = false

  if File.exist?(index_path)
    index_size = format_size(File.size(index_path))
    age_days = manager.cache_age_days
    mtime = manager.cache_mtime
    expiry_days = app_config&.dump_expiry_days || manager.dump_expiry_days

    cache_date_str = mtime ? mtime.strftime("%Y-%m-%d") : "unknown"
    age_str = age_days && age_days >= 1 ? "#{age_days.round(0)} days ago" : "today"

    # An unknown age counts as "not stale" and must not leak nil to the caller
    cache_is_stale = age_days ? age_days > expiry_days : false

    if cache_is_stale
      print_list_item("Index: #{pastel.yellow('cached')} (#{index_size}, #{cache_date_str} - #{age_str})", status: :warning)
      print_list_item(" Cache is older than #{expiry_days} days (recommended refresh)", status: :warning, indent: 2)
    else
      print_list_item("Index: #{pastel.green('cached')} (#{index_size}, #{cache_date_str} - #{age_str})", status: :success)
    end
  else
    print_list_item("Index: #{pastel.yellow('download required')}", status: :warning)
  end

  # Check multistream cache
  full_path = manager.cached_multistream_path

  if File.exist?(full_path)
    dump_size = format_size(File.size(full_path))
    # Read the mtime once so the date and the age describe the same instant
    dump_mtime = File.mtime(full_path)
    dump_date_str = dump_mtime.strftime("%Y-%m-%d")
    dump_age = ((Time.now - dump_mtime) / 86400).round(0)
    dump_age_str = dump_age >= 1 ? "#{dump_age} days ago" : "today"

    # The dump line is flagged based on the index cache's staleness
    if cache_is_stale
      print_list_item("Dump: #{pastel.yellow('cached')} (#{dump_size}, #{dump_date_str} - #{dump_age_str})", status: :warning)
    else
      print_list_item("Dump: #{pastel.green('cached')} (#{dump_size}, #{dump_date_str} - #{dump_age_str})", status: :success)
    end
  else
    # Check for partial downloads
    partial = manager.find_suitable_partial_cache(1)
    if partial
      partial_size = format_size(File.size(partial))
      partial_date = File.mtime(partial).strftime("%Y-%m-%d")
      print_list_item("Dump: #{pastel.cyan('partial cached')} (#{partial_size}, #{partial_date})", status: :success)
      print_list_item(" Additional download may be required depending on article locations", status: :pending)
    else
      print_list_item("Dump: #{pastel.yellow('download required')} (several GB)", status: :warning)
    end
  end

  if cache_is_stale
    puts
    print_warning("Cache is stale. Use --update-cache to force refresh.")
  end

  puts
  cache_is_stale
end
246
+
247
# Assemble the configuration hash used when formatting extracted articles.
#
# @param opts [Hash] CLI options
# @param format [Symbol] output format, stored under :format
# @return [Hash] fixed settings first, then every pass-through option copied
#   from opts (nil when absent), then :markers as returned by
#   parse_markers_option
def build_extraction_config(opts, format)
  settings = {
    format: format,
    num_procs: 1, # Single-threaded for article extraction
    file_size: opts[:file_size],
    bz2_gem: opts[:bz2_gem]
  }

  # Content options followed by section extraction options; order is kept
  # so the resulting hash has the same key order as before
  passthrough_keys = %i[
    title list heading table redirect multiline category category_only
    summary_only marker extract_citations
    sections section_output min_section_length skip_empty
    alias_file no_section_aliases show_matched_sections
  ]
  passthrough_keys.each_with_object(settings) { |key, acc| acc[key] = opts[key] }

  # A comma-separated sections string becomes an array of non-empty names
  requested = settings[:sections]
  if requested.is_a?(String)
    settings[:sections] = requested.split(",").map(&:strip).reject(&:empty?)
  end

  settings[:markers] = parse_markers_option(opts[:markers])
  settings
end
275
+
276
# Extract articles from a Wikipedia category.
#
# Runs six steps: scan the category (preview), fetch the full article list,
# download the index, locate the articles in the dump, download the needed
# streams, then extract and write the articles. Between the preview and the
# fetch it shows the cache status, honours --dry-run and asks for
# confirmation unless --yes was given.
#
# @param opts [Hash] CLI options; this method reads :lang, :from_category,
#   :depth, :cache_dir, :dry_run, :yes, :update_cache, :output_dir, :format
#   and :file_size, and passes the whole hash on to build_extraction_config
# @return [Integer] EXIT_ERROR on a fetch failure, a missing TTY for the
#   confirmation prompt, or when no article is in the dump; EXIT_PARTIAL
#   when some articles were not in the dump or could not be extracted;
#   EXIT_SUCCESS otherwise (including empty category, dry run and user cancel)
def extract_category_articles(opts)
  lang = opts[:lang]
  category = opts[:from_category]
  max_depth = opts[:depth]
  cache_dir = opts[:cache_dir]
  dry_run = opts[:dry_run]
  skip_confirm = opts[:yes]
  total_steps = 6
  start_time = Time.now

  # Mode banner
  print_mode_banner("Category Extraction", {
    "Language" => lang,
    "Category" => category,
    "Depth" => max_depth,
    "Output" => opts[:output_dir]
  })

  # Get config values
  app_config = Wp2txt::CLI.config

  # Create category fetcher
  fetcher = Wp2txt::CategoryFetcher.new(
    lang, category,
    max_depth: max_depth,
    cache_expiry_days: app_config.category_expiry_days
  )

  # Step 1: Fetch preview
  print_header("Scanning category", step: 1, total_steps: total_steps)
  spinner = create_spinner("Fetching category information...")
  spinner.auto_spin

  begin
    preview = fetcher.fetch_preview
    spinner.success(pastel.green("Done!"))
  rescue Net::OpenTimeout, Net::ReadTimeout, SocketError, Errno::ECONNREFUSED, OpenSSL::SSL::SSLError, JSON::ParserError, IOError => e
    spinner.error(pastel.red("Failed!"))
    print_error("Error fetching category: #{e.message}")
    return EXIT_ERROR
  end

  # Display preview
  print_subheader("Category Preview")
  print_info("Category", preview[:category])
  print_info("Depth", preview[:depth].to_s)
  puts

  if preview[:subcategories] && !preview[:subcategories].empty?
    puts pastel.dim("Categories scanned:")
    preview[:subcategories].each do |subcat|
      print_list_item("#{subcat[:name]} (#{subcat[:article_count]} articles)")
    end
    puts
  end

  puts pastel.dim("Summary:")
  print_info("Subcategories", (preview[:total_subcategories] || 0).to_s, indent: 1)
  print_info("Total articles", pastel.bold(preview[:total_articles].to_s), indent: 1)
  puts

  # Check if there are any articles
  if preview[:total_articles].zero?
    print_warning("No articles found in this category.")
    return EXIT_SUCCESS # Not an error, just empty category
  end

  # Warn about large extractions
  if preview[:total_articles] > 1000
    print_warning("Large category with #{preview[:total_articles]} articles.")
    puts pastel.yellow(" Extraction may take a long time and require significant disk space.")
    puts
  end

  # Show cache status before confirmation. The call is made for its printed
  # output; its staleness result is not needed here.
  temp_manager = Wp2txt::DumpManager.new(
    lang,
    cache_dir: cache_dir,
    dump_expiry_days: app_config.dump_expiry_days
  )
  show_download_estimate(temp_manager, app_config)
  force_update = opts[:update_cache]

  # Dry run mode - exit here
  if dry_run
    print_info_message("Dry run mode - no articles will be extracted.")
    return EXIT_SUCCESS
  end

  # Confirmation prompt
  unless skip_confirm
    unless $stdin.tty?
      print_error("Interactive confirmation required.")
      puts pastel.red(" Use --yes to skip confirmation when running non-interactively.")
      return EXIT_ERROR
    end

    unless confirm?("Proceed with extraction?")
      puts "Extraction cancelled."
      return EXIT_SUCCESS # User chose to cancel
    end
  end

  # Step 2: Fetch full article list
  print_header("Fetching articles from API", step: 2, total_steps: total_steps)
  spinner = create_spinner("Fetching article list...")
  spinner.auto_spin

  begin
    article_titles = fetcher.fetch_articles
    spinner.success(pastel.green("#{article_titles.size} articles"))
  rescue Net::OpenTimeout, Net::ReadTimeout, SocketError, Errno::ECONNREFUSED, OpenSSL::SSL::SSLError, JSON::ParserError, IOError => e
    spinner.error(pastel.red("Failed!"))
    print_error("Error fetching articles: #{e.message}")
    return EXIT_ERROR
  end

  if article_titles.empty?
    print_warning("No articles to extract.")
    return EXIT_SUCCESS # Not an error, just empty result
  end

  # Create dump manager
  manager = Wp2txt::DumpManager.new(
    lang,
    cache_dir: cache_dir,
    dump_expiry_days: app_config.dump_expiry_days
  )

  # Step 3: Download index
  print_header("Downloading index", step: 3, total_steps: total_steps)
  index_path = manager.download_index(force: force_update)

  # Step 4: Load index and locate articles
  print_header("Locating articles in dump", step: 4, total_steps: total_steps)

  # Use SQLite cache for fast repeated access
  index = Wp2txt::MultistreamIndex.new(index_path, cache_dir: cache_dir)

  if index.loaded_from_cache?
    puts " Index loaded from cache (#{index.size} entries)"
  else
    puts " Index parsed (#{index.size} entries)"
  end

  # Find articles in index
  found_articles = []
  not_found = []

  article_titles.each do |title|
    entry = index.find_by_title(title)
    if entry
      found_articles << entry
    else
      not_found << title
    end
  end

  print_list_item("Found in dump: #{found_articles.size}", status: :success)
  print_list_item("Not in dump: #{not_found.size}", status: :warning) if not_found.any?

  if found_articles.empty?
    print_error("No articles found in dump. The dump may be out of date.")
    return EXIT_ERROR
  end

  # Step 5: Download multistream
  streams_needed = found_articles.map { |e| e[:offset] }.uniq.sort
  print_header("Downloading data (#{streams_needed.size} streams)", step: 5, total_steps: total_steps)
  multistream_path = download_partial_streams(manager, index, streams_needed, force: force_update)

  # Create multistream reader, reusing the existing index (avoids double parsing)
  reader = Wp2txt::MultistreamReader.new(multistream_path, index)

  # Build config
  format = opts[:format].to_s.downcase.to_sym
  config = build_extraction_config(opts, format)

  # Create output writer
  base_name = "#{lang}wiki_#{sanitize_filename(category)}"
  writer = OutputWriter.new(
    output_dir: opts[:output_dir],
    base_name: base_name,
    format: format,
    file_size_mb: opts[:file_size]
  )

  # Step 6: Extract and process articles
  print_header("Extracting articles", step: 6, total_steps: total_steps)
  total_count = found_articles.size
  bar = create_progress_bar(" Processing", total_count)

  extracted_count = 0
  extraction_start = Time.now

  # Use parallel extraction for large batches (>50 articles across multiple streams)
  streams_count = found_articles.map { |e| e[:offset] }.uniq.size
  use_parallel = total_count > 50 && streams_count > 1

  if use_parallel
    # Parallel extraction: process streams concurrently
    num_procs = [streams_count, 4].min # Cap at 4 processes
    pages = reader.each_article_parallel(found_articles, num_processes: num_procs).to_a

    pages.each do |page|
      article = Article.new(page[:text], page[:title], !config[:marker])
      result = format_article(article, config)
      writer.write(result)
      extracted_count += 1
      bar.advance
    end
  else
    # Sequential extraction for small batches
    found_articles.each do |entry|
      title = entry[:title]
      page = reader.extract_article(title)

      if page
        article = Article.new(page[:text], page[:title], !config[:marker])
        result = format_article(article, config)
        writer.write(result)
        extracted_count += 1
      end

      bar.advance
    end
  end

  bar.finish
  extraction_time = Time.now - extraction_start

  # Close output
  output_files = writer.close

  # Summary. Articles that were located in the index but produced no page
  # count as issues too, matching extract_specific_articles: without this a
  # run with extraction failures would report success and exit 0.
  total_time = Time.now - start_time
  has_issues = not_found.any? || extracted_count < total_count
  status = has_issues ? :warning : :success

  print_summary("Extraction Complete", {
    "Articles extracted" => "#{extracted_count}/#{article_titles.size}",
    "Output files" => output_files.size.to_s,
    "Extraction time" => format_duration(extraction_time),
    "Total time" => format_duration(total_time)
  }, status: status)

  if not_found.any?
    puts unless quiet?
    if not_found.size <= 10
      print_warning("Not found in dump (#{not_found.size}):")
      not_found.each { |t| print_list_item(t, status: :warning) }
    else
      print_warning("#{not_found.size} articles not found (may be newer than dump)")
    end
  end

  puts unless quiet?
  puts pastel.dim("Output files:") unless quiet?
  output_files.each { |f| print_list_item(f, status: :success) }

  # Return appropriate exit code
  has_issues ? EXIT_PARTIAL : EXIT_SUCCESS
end
539
+
540
# Turn a category name into a string that is safe to embed in a filename.
#
# Each of / \ : * ? " < > | becomes "_", every run of whitespace collapses
# to a single "_", and the result is cut to at most 50 characters.
#
# @param name [String] category name
# @return [String] sanitized name
def sanitize_filename(name)
  without_reserved = name.gsub(%r{[/\\:*?"<>|]}, "_")
  underscored = without_reserved.gsub(/\s+/, "_")
  underscored.slice(0, 50)
end
544
+ end
545
+ end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "find"
4
+
5
+ module Wp2txt
6
+ # File operation utilities
7
+
8
# Recursively collect the paths found under a starting path.
#
# @param str [String] starting path handed to Find.find (itself included
#   in the traversal)
# @param regex [Regexp, nil] keep only paths matching this pattern; nil
#   means keep everything
# @return [Array<String>] matching paths in sorted order
def collect_files(str, regex = nil)
  pattern = regex || //
  Find.find(str).select { |path| pattern =~ path }.sort
end
17
+
18
# Modify a file using block/yield mechanism.
#
# The file's content is yielded to the block; a non-nil block result
# replaces the content, a nil result leaves it unchanged. The new content is
# written to a temporary file in the same directory and then swapped into
# place, so a failed write never truncates the original.
#
# @param file_path [String] path of the file to rewrite
# @param backup [Boolean] keep the previous version as "<file_path>.bak"
# @yieldparam str [String] current file content
# @yieldreturn [String, nil] replacement content, or nil to keep it as is
# @return [Integer, nil] result of removing the backup (nil when it is kept)
# @raise [StandardError] any I/O error is re-raised after cleanup
def file_mod(file_path, backup = false)
  require "tempfile" # loaded lazily; only this method needs it

  original = File.read(file_path)
  replacement = yield(original)
  content = replacement.nil? ? original : replacement

  backup_path = "#{file_path}.bak"
  temp = Tempfile.new(["wp2txt_", File.extname(file_path)], File.dirname(file_path))
  begin
    temp.write(content)
    temp.close

    # Tempfile creates its file with mode 0600; carry over the original
    # permission bits so the rewrite does not silently restrict the file.
    File.chmod(File.stat(file_path).mode & 0o7777, temp.path)

    File.rename(file_path, backup_path)
    begin
      File.rename(temp.path, file_path)
    rescue StandardError
      # Put the original back rather than leaving it stranded as .bak
      File.rename(backup_path, file_path)
      raise
    end

    File.unlink(backup_path) unless backup
  rescue StandardError
    begin
      temp.close! # best-effort removal of the temporary file
    rescue StandardError
      nil
    end
    raise
  end
end
38
+
39
# Yield every regular file under a directory (recursive), or the path
# itself when it is a regular file. Any other path yields nothing.
#
# @param dir_path [String] directory or file path
# @yieldparam file [String] path of a regular file
# @return [Array<String>, Object, nil] the collected path list for a
#   directory, the block's result for a single file, nil otherwise
def batch_file_mod(dir_path)
  unless FileTest.directory?(dir_path)
    return FileTest.file?(dir_path) ? yield(dir_path) : nil
  end

  collect_files(dir_path).each do |path|
    next unless FileTest.file?(path)

    yield path
  end
end
49
+
50
# Normalize path separators for the current platform.
#
# Strings have "/" turned into "\" when RUBY_PLATFORM contains "win32" and
# "\" turned into "/" otherwise; arrays are converted element by element;
# any other input yields nil.
#
# NOTE(review): only platforms whose name contains "win32" take the Windows
# branch; confirm whether mingw/ucrt builds are meant to be covered as well.
#
# @param input [String, Array] path or (possibly nested) list of paths
# @return [String, Array, nil] converted value
def correct_separator(input)
  return input.map { |element| correct_separator(element) } if input.is_a?(Array)
  return unless input.is_a?(String)

  # tr is a plain character replacement, so no regex is needed
  from, to = RUBY_PLATFORM.index("win32") ? ["/", "\\"] : ["\\", "/"]
  input.tr(from, to)
end
64
+
65
# Give generated files an extension and zero-pad their trailing "-<number>"
# suffix to a uniform width.
#
# The width is the number of digits of the longest numeric suffix among
# all files. It is computed before any file is renamed, so every file is
# padded to the same width regardless of the order of the list. Files
# without a numeric suffix only receive the extension.
#
# @param files [Array<String>] paths to rename
# @param ext [String] extension to append (without the dot)
# @return [true]
def rename(files, ext = "txt")
  suffix = /-(\d+)\z/

  # num of digits necessary to name the last file generated
  maxwidth = files.map { |f| f[suffix, 1].to_s.length }.max || 0

  files.each do |f|
    newname = f.sub(suffix) do
      format("-%0#{maxwidth}d", Regexp.last_match(1).to_i)
    end
    File.rename(f, "#{newname}.#{ext}")
  end
  true
end
79
+
80
# Convert a number of seconds to a string in the format 00:00:00.
#
# Fractional input (for example an elapsed-time difference) is floored to
# whole seconds first, so hours and minutes are always computed on integers.
#
# @param int [Numeric, nil] seconds; nil or false means "unknown"
# @return [String] "HH:MM:SS", or "--:--:--" when no value is given
def sec_to_str(int)
  return "--:--:--" unless int

  hours, remainder = int.floor.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  format("%02d:%02d:%02d", hours, minutes, seconds)
end
91
+ end