wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Wp2txt
  # =========================================================================
  # Custom Exception Classes
  # =========================================================================
  # Base error class for all Wp2txt errors.
  # Rescue this to catch any error raised through this hierarchy.
  class Error < StandardError; end

  # Raised when text parsing or conversion fails
  class ParseError < Error; end

  # Raised when network operations fail
  class NetworkError < Error; end

  # Raised when file I/O operations fail
  class FileIOError < Error; end

  # Raised when encoding conversion fails
  #
  # NOTE: this class shadows Ruby's core ::EncodingError inside the Wp2txt
  # namespace. Code nested in Wp2txt that writes a bare `EncodingError`
  # resolves to this class; write `::EncodingError` to refer to the core one.
  class EncodingError < Error; end

  # Raised when cache operations fail
  class CacheError < Error; end

  # =========================================================================
  # Shared Constants
  # =========================================================================
  # Centralized constants to avoid magic numbers and duplication across files.
  # This file should be required by all modules that need these values.

  # ---------------------------------------------------------------------------
  # Time Constants
  # ---------------------------------------------------------------------------
  SECONDS_PER_DAY = 86_400
  SECONDS_PER_HOUR = 3_600
  SECONDS_PER_MINUTE = 60

  # ---------------------------------------------------------------------------
  # Cache Settings
  # ---------------------------------------------------------------------------
  # Default expiry for downloaded Wikipedia dump files
  DEFAULT_DUMP_EXPIRY_DAYS = 30

  # Default expiry for category member cache
  DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS = 7

  # ---------------------------------------------------------------------------
  # Network Settings
  # ---------------------------------------------------------------------------
  # Default timeout for HTTP requests (seconds)
  DEFAULT_HTTP_TIMEOUT = 30

  # Default progress reporting interval (seconds)
  DEFAULT_PROGRESS_INTERVAL = 10

  # Index parsing progress reporting threshold (entries)
  INDEX_PROGRESS_THRESHOLD = 500_000

  # Default number of top section headings to include in stats output
  DEFAULT_TOP_N_SECTIONS = 50

  # Download resume metadata max age (days)
  RESUME_METADATA_MAX_AGE_DAYS = 7

  # ---------------------------------------------------------------------------
  # Processing Limits
  # ---------------------------------------------------------------------------
  # Safety limit for deeply nested structure processing (templates, tables, etc.)
  # This prevents infinite loops in malformed markup
  MAX_NESTING_ITERATIONS = 50_000

  # Buffer size for file reading (10 MB)
  # Optimized for Wikipedia dump processing
  DEFAULT_BUFFER_SIZE = 10_485_760

  # Minimum buffer size (1 MB) - don't go below this
  MIN_BUFFER_SIZE = 1_048_576

  # Maximum buffer size (100 MB) - don't exceed this
  MAX_BUFFER_SIZE = 104_857_600

  # ---------------------------------------------------------------------------
  # File Size Units (Binary - for accurate file sizes)
  # ---------------------------------------------------------------------------
  BYTES_PER_KB = 1_024
  BYTES_PER_MB = 1_024 * 1_024
  BYTES_PER_GB = 1_024 * 1_024 * 1_024

  # ---------------------------------------------------------------------------
  # Helper Methods
  # ---------------------------------------------------------------------------
  # Every method below is callable as Wp2txt.<name> and is also available as a
  # private instance method to anything that includes this module.
  module_function

  # Convert days to seconds
  # @param days [Integer, Float] Number of days
  # @return [Integer] Seconds (fractional seconds are truncated)
  def days_to_seconds(days)
    (days * SECONDS_PER_DAY).to_i
  end

  # Check whether a file exists and was modified within the last +days+ days
  # @param path [String] File path
  # @param days [Integer, Float] Maximum age in days
  # @return [Boolean] true if file is fresh (not expired); false if it is
  #   older than +days+ or does not exist
  def file_fresh?(path, days)
    return false unless File.exist?(path)

    File.mtime(path) > Time.now - days_to_seconds(days)
  rescue Errno::ENOENT
    # The file was removed between the existence check and the mtime read;
    # a vanished file is treated the same as a missing one.
    false
  end

  # Calculate file age in days
  # @param path [String] File path
  # @return [Float, nil] Age in days (rounded to one decimal place),
  #   or nil if file doesn't exist
  def file_age_days(path)
    return nil unless File.exist?(path)

    ((Time.now - File.mtime(path)) / SECONDS_PER_DAY).round(1)
  rescue Errno::ENOENT
    # The file was removed between the existence check and the mtime read.
    nil
  end

  # Format file size in human-readable form (binary units)
  #
  # The unit is chosen after rounding, so a value that rounds up to 1024 of
  # one unit is reported in the next unit (e.g. 1_048_575 bytes is "1.0 MB",
  # not "1024.0 KB").
  #
  # @param bytes [Integer] Size in bytes
  # @return [String] Formatted size (e.g., "1.5 MB")
  def format_file_size(bytes)
    return "#{bytes} B" if bytes < BYTES_PER_KB

    # Each binary unit holds 1024 of the previous one, so BYTES_PER_KB doubles
    # as the "promote to the next unit" threshold for the rounded values.
    kilobytes = (bytes.to_f / BYTES_PER_KB).round(1)
    return "#{kilobytes} KB" if kilobytes < BYTES_PER_KB

    megabytes = (bytes.to_f / BYTES_PER_MB).round(1)
    return "#{megabytes} MB" if megabytes < BYTES_PER_KB

    "#{(bytes.to_f / BYTES_PER_GB).round(2)} GB"
  end
end
|