wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,134 @@
1
+ # frozen_string_literal: true
2
+
3
module Wp2txt
  # =========================================================================
  # Custom Exception Classes
  # =========================================================================
  # Base error class for all Wp2txt errors.
  # Rescuing Wp2txt::Error catches every library-specific error defined below.
  class Error < StandardError; end

  # Raised when text parsing or conversion fails
  class ParseError < Error; end

  # Raised when network operations fail
  class NetworkError < Error; end

  # Raised when file I/O operations fail
  class FileIOError < Error; end

  # Raised when encoding conversion fails
  # NOTE: inside the Wp2txt namespace the bare name `EncodingError` resolves to
  # this class, not to Ruby's core ::EncodingError. Write `::EncodingError`
  # explicitly when rescuing errors raised by String#encode and friends.
  class EncodingError < Error; end

  # Raised when cache operations fail
  class CacheError < Error; end

  # =========================================================================
  # Shared Constants
  # =========================================================================
  # Centralized constants to avoid magic numbers and duplication across files.
  # This file should be required by all modules that need these values.

  # ---------------------------------------------------------------------------
  # Time Constants
  # ---------------------------------------------------------------------------
  SECONDS_PER_DAY = 86_400
  SECONDS_PER_HOUR = 3_600
  SECONDS_PER_MINUTE = 60

  # ---------------------------------------------------------------------------
  # Cache Settings
  # ---------------------------------------------------------------------------
  # Default expiry for downloaded Wikipedia dump files
  DEFAULT_DUMP_EXPIRY_DAYS = 30

  # Default expiry for category member cache
  DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS = 7

  # ---------------------------------------------------------------------------
  # Network Settings
  # ---------------------------------------------------------------------------
  # Default timeout for HTTP requests (seconds)
  DEFAULT_HTTP_TIMEOUT = 30

  # Default progress reporting interval (seconds)
  DEFAULT_PROGRESS_INTERVAL = 10

  # Index parsing progress reporting threshold (entries)
  INDEX_PROGRESS_THRESHOLD = 500_000

  # Default number of top section headings to include in stats output
  DEFAULT_TOP_N_SECTIONS = 50

  # Download resume metadata max age (days)
  RESUME_METADATA_MAX_AGE_DAYS = 7

  # ---------------------------------------------------------------------------
  # Processing Limits
  # ---------------------------------------------------------------------------
  # Safety limit for deeply nested structure processing (templates, tables, etc.)
  # This prevents infinite loops in malformed markup
  MAX_NESTING_ITERATIONS = 50_000

  # Buffer size for file reading (10 MB)
  # Optimized for Wikipedia dump processing
  DEFAULT_BUFFER_SIZE = 10_485_760

  # Minimum buffer size (1 MB) - don't go below this
  MIN_BUFFER_SIZE = 1_048_576

  # Maximum buffer size (100 MB) - don't exceed this
  MAX_BUFFER_SIZE = 104_857_600

  # ---------------------------------------------------------------------------
  # File Size Units (Binary - for accurate file sizes)
  # ---------------------------------------------------------------------------
  BYTES_PER_KB = 1_024
  BYTES_PER_MB = 1_024 * 1_024
  BYTES_PER_GB = 1_024 * 1_024 * 1_024

  # ---------------------------------------------------------------------------
  # Helper Methods
  # ---------------------------------------------------------------------------
  # Every method below is callable as Wp2txt.method_name (module function).
  module_function

  # Convert days to seconds
  # @param days [Integer, Float] Number of days
  # @return [Integer] Seconds (fractional seconds are truncated)
  def days_to_seconds(days)
    (days * SECONDS_PER_DAY).to_i
  end

  # Check whether a file exists and was modified within the last `days` days
  # @param path [String] File path
  # @param days [Integer, Float] Maximum allowed age in days
  # @return [Boolean] true if file is fresh (not expired); false if it is
  #   older than `days` or does not exist
  def file_fresh?(path, days)
    return false unless File.exist?(path)

    File.mtime(path) > Time.now - days_to_seconds(days)
  rescue Errno::ENOENT
    # The file vanished between the existence check and the mtime read;
    # treat it exactly like a missing file.
    false
  end

  # Calculate file age in days
  # @param path [String] File path
  # @return [Float, nil] Age in days (rounded to one decimal), or nil if
  #   file doesn't exist
  def file_age_days(path)
    return nil unless File.exist?(path)

    ((Time.now - File.mtime(path)) / SECONDS_PER_DAY).round(1)
  rescue Errno::ENOENT
    # The file vanished between the existence check and the mtime read;
    # treat it exactly like a missing file.
    nil
  end

  # Format file size in human-readable form (binary units)
  #
  # The unit is chosen from the *rounded* value, so a size just below a unit
  # boundary rolls over to the next unit ("1.0 MB") instead of being shown
  # as "1024.0 KB".
  #
  # @param bytes [Integer] Size in bytes
  # @return [String] Formatted size (e.g., "1.5 MB")
  def format_file_size(bytes)
    return "#{bytes} B" if bytes < BYTES_PER_KB

    kilobytes = (bytes.to_f / BYTES_PER_KB).round(1)
    return "#{kilobytes} KB" if kilobytes < BYTES_PER_KB

    megabytes = (bytes.to_f / BYTES_PER_MB).round(1)
    return "#{megabytes} MB" if megabytes < BYTES_PER_KB

    "#{(bytes.to_f / BYTES_PER_GB).round(2)} GB"
  end
end