wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,545 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "multistream"
|
|
4
|
+
require_relative "cli"
|
|
5
|
+
|
|
6
|
+
module Wp2txt
|
|
7
|
+
# Article extraction utilities for WpApp
|
|
8
|
+
module Extractor
|
|
9
|
+
# Exit codes
|
|
10
|
+
EXIT_SUCCESS = 0
|
|
11
|
+
EXIT_ERROR = 1
|
|
12
|
+
EXIT_PARTIAL = 2
|
|
13
|
+
|
|
14
|
+
# Extract specific articles by title.
#
# Downloads (or reuses cached) multistream index data for the requested
# language, locates each requested title in the index, downloads only the
# streams that contain them, and writes the formatted articles through an
# OutputWriter. Progress and results are reported via the CLI UI helpers
# (print_header, print_list_item, print_summary, ...).
#
# @param opts [Hash] parsed CLI options; reads :lang, :articles,
#   :cache_dir, :update_cache, :output_dir, :format, :file_size and the
#   formatting flags consumed by build_extraction_config
# @return [Integer] EXIT_SUCCESS when everything extracted, EXIT_PARTIAL
#   when some titles were missing or failed, EXIT_ERROR when nothing
#   could be extracted
def extract_specific_articles(opts)
  lang = opts[:lang]
  cache_dir = opts[:cache_dir]
  article_titles = Wp2txt::CLI.parse_article_list(opts[:articles])
  app_config = Wp2txt::CLI.config
  force_update = opts[:update_cache]
  total_steps = 4
  start_time = Time.now

  # Mode banner (truncate the title list to three entries for display)
  articles_display = article_titles.size > 3 ? "#{article_titles.first(3).join(', ')}... (#{article_titles.size} total)" : article_titles.join(", ")
  print_mode_banner("Article Extraction", {
    "Language" => lang,
    "Articles" => articles_display,
    "Output" => opts[:output_dir]
  })

  # Create dump manager
  manager = Wp2txt::DumpManager.new(
    lang,
    cache_dir: cache_dir,
    dump_expiry_days: app_config.dump_expiry_days
  )

  # Step 1: Download index
  print_header("Downloading index", step: 1, total_steps: total_steps)
  index_path = manager.download_index(force: force_update)

  # Step 2: Load index and find articles
  print_header("Locating articles", step: 2, total_steps: total_steps)

  # Use SQLite cache for fast repeated access
  # Early termination: stop parsing when all target articles are found (if not using cache)
  index = Wp2txt::MultistreamIndex.new(
    index_path,
    cache_dir: cache_dir,
    target_titles: article_titles
  )

  if index.loaded_from_cache?
    puts " Index loaded from cache (#{index.size} entries)"
  elsif index.early_terminated?
    puts " Early termination: found all #{article_titles.size} articles"
  end
  puts

  # Find requested articles, partitioning titles into found / not-found
  found_articles = []
  not_found = []

  article_titles.each do |title|
    entry = index.find_by_title(title)
    if entry
      found_articles << entry
      print_list_item("#{title}", status: :success)
    else
      not_found << title
      print_list_item("#{title} (not found)", status: :error)
    end
  end

  if found_articles.empty?
    puts unless quiet?
    print_error("No articles found. Please check the titles.")
    return EXIT_ERROR
  end

  # Step 3: Download streams (only the unique stream offsets needed)
  streams_needed = found_articles.map { |e| e[:offset] }.uniq.sort
  print_header("Downloading data (#{streams_needed.size} streams)", step: 3, total_steps: total_steps)
  multistream_path = download_partial_streams(manager, index, streams_needed, force: force_update)

  # Create multistream reader, reusing the existing index (avoids double parsing)
  reader = Wp2txt::MultistreamReader.new(multistream_path, index)

  # Build config for processing
  format = opts[:format].to_s.downcase.to_sym
  config = build_extraction_config(opts, format)

  # Create output writer
  base_name = "#{lang}wiki_articles"
  writer = OutputWriter.new(
    output_dir: opts[:output_dir],
    base_name: base_name,
    format: format,
    file_size_mb: opts[:file_size]
  )

  # Step 4: Extract articles
  print_header("Extracting articles", step: 4, total_steps: total_steps)
  extracted_count = 0
  extraction_failures = []

  found_articles.each do |entry|
    title = entry[:title]
    page = reader.extract_article(title)

    if page
      # NOTE(review): third Article.new arg presumably toggles marker
      # stripping when --marker is off — confirm against Article#initialize
      article = Article.new(page[:text], page[:title], !config[:marker])
      result = format_article(article, config)
      writer.write(result)
      extracted_count += 1
      print_list_item("#{title}", status: :success)
    else
      extraction_failures << title
      print_list_item("#{title} (extraction failed)", status: :warning)
    end
  end

  # Close output
  output_files = writer.close
  total_time = Time.now - start_time

  # Summary
  has_issues = not_found.any? || extraction_failures.any?
  status = has_issues ? :warning : :success

  print_summary("Extraction Complete", {
    "Extracted" => "#{extracted_count}/#{article_titles.size}",
    "Output files" => output_files.size.to_s,
    "Total time" => format_duration(total_time)
  }, status: status)

  if not_found.any?
    puts unless quiet?
    print_warning("Not found in index (#{not_found.size}):")
    not_found.each { |t| print_list_item(t, status: :error) }
  end

  puts unless quiet?
  puts pastel.dim("Output files:") unless quiet?
  output_files.each { |f| print_list_item(f, status: :success) }

  # Return appropriate exit code
  has_issues ? EXIT_PARTIAL : EXIT_SUCCESS
end
|
|
151
|
+
|
|
152
|
+
# Download only the leading portion of the multistream dump that covers
# every requested stream offset. Falls back to a full download when the
# highest needed stream is the last one in the index (or cannot be
# located), since a partial download cannot bound the final stream.
#
# @param manager [DumpManager] The dump manager
# @param index [MultistreamIndex] The multistream index
# @param stream_offsets [Array<Integer>] Byte offsets of streams to download
# @param force [Boolean] Force re-download even if cached
def download_partial_streams(manager, index, stream_offsets, force: false)
  known_offsets = index.stream_offsets
  highest_needed = stream_offsets.max
  position = known_offsets.index(highest_needed)

  # Last stream or unknown offset: a partial download can't help.
  if position.nil? || position >= known_offsets.size - 1
    manager.download_multistream(force: force)
  else
    # Request partial download (DumpManager handles caching logic)
    manager.download_multistream(max_streams: position + 1, force: force)
  end
end
|
|
173
|
+
|
|
174
|
+
# Show download estimate before confirmation
#
# Prints the cache status of the index file and the multistream dump
# (cached / partial / download required), including size, date and age,
# colorized via pastel. A cache older than the configured expiry is
# flagged as stale and a refresh hint is printed.
#
# @param manager [DumpManager] The dump manager
# @param app_config [Config] Application configuration
# @return [Boolean] True if cache is stale and user should be warned
def show_download_estimate(manager, app_config = nil)
  puts pastel.dim("Download status:")

  # Check index cache
  index_path = manager.cached_index_path
  index_cached = File.exist?(index_path)
  cache_is_stale = false

  if index_cached
    index_size = format_size(File.size(index_path))
    age_days = manager.cache_age_days
    mtime = manager.cache_mtime
    # app_config takes precedence; fall back to the manager's own setting
    expiry_days = app_config&.dump_expiry_days || manager.dump_expiry_days

    # Format cache date
    cache_date_str = mtime ? mtime.strftime("%Y-%m-%d") : "unknown"

    # Check if stale (age_days may be nil when mtime is unknown)
    cache_is_stale = age_days && age_days > expiry_days

    if cache_is_stale
      age_str = age_days >= 1 ? "#{age_days.round(0)} days ago" : "today"
      print_list_item("Index: #{pastel.yellow('cached')} (#{index_size}, #{cache_date_str} - #{age_str})", status: :warning)
      print_list_item(" Cache is older than #{expiry_days} days (recommended refresh)", status: :warning, indent: 2)
    else
      age_str = age_days && age_days >= 1 ? "#{age_days.round(0)} days ago" : "today"
      print_list_item("Index: #{pastel.green('cached')} (#{index_size}, #{cache_date_str} - #{age_str})", status: :success)
    end
  else
    print_list_item("Index: #{pastel.yellow('download required')}", status: :warning)
  end

  # Check multistream cache
  full_path = manager.cached_multistream_path
  full_cached = File.exist?(full_path)

  if full_cached
    dump_size = format_size(File.size(full_path))
    dump_date_str = File.mtime(full_path).strftime("%Y-%m-%d")
    # 86400 seconds per day
    dump_age = ((Time.now - File.mtime(full_path)) / 86400).round(0)
    dump_age_str = dump_age >= 1 ? "#{dump_age} days ago" : "today"

    # Dump status color follows the index staleness flag
    if cache_is_stale
      print_list_item("Dump: #{pastel.yellow('cached')} (#{dump_size}, #{dump_date_str} - #{dump_age_str})", status: :warning)
    else
      print_list_item("Dump: #{pastel.green('cached')} (#{dump_size}, #{dump_date_str} - #{dump_age_str})", status: :success)
    end
  else
    # Check for partial downloads
    partial = manager.find_suitable_partial_cache(1)
    if partial
      partial_size = format_size(File.size(partial))
      partial_date = File.mtime(partial).strftime("%Y-%m-%d")
      print_list_item("Dump: #{pastel.cyan('partial cached')} (#{partial_size}, #{partial_date})", status: :success)
      print_list_item(" Additional download may be required depending on article locations", status: :pending)
    else
      print_list_item("Dump: #{pastel.yellow('download required')} (several GB)", status: :warning)
    end
  end

  if cache_is_stale
    puts
    print_warning("Cache is stale. Use --update-cache to force refresh.")
  end

  puts
  cache_is_stale
end
|
|
246
|
+
|
|
247
|
+
# Assemble the processing-config hash used when extracting articles.
# Copies the relevant CLI options into a flat hash, normalizes the
# :sections option (comma-separated string -> array of clean names), and
# resolves marker settings via parse_markers_option.
#
# @param opts [Hash] parsed CLI options
# @param format [Symbol] output format (already downcased symbol)
# @return [Hash] config consumed by format_article / OutputWriter
def build_extraction_config(opts, format)
  # Options copied verbatim from opts: formatting flags plus the
  # section-extraction options.
  passthrough_keys = %i[title list heading table redirect multiline
                        category category_only summary_only marker
                        extract_citations sections section_output
                        min_section_length skip_empty alias_file
                        no_section_aliases show_matched_sections]

  config = {
    format: format,
    num_procs: 1, # Single-threaded for article extraction
    file_size: opts[:file_size],
    bz2_gem: opts[:bz2_gem]
  }
  passthrough_keys.each { |key| config[key] = opts[key] }

  # Parse sections string into array if provided
  sections = config[:sections]
  if sections.is_a?(String)
    config[:sections] = sections.split(",").map(&:strip).reject(&:empty?)
  end

  config[:markers] = parse_markers_option(opts[:markers])
  config
end
|
|
275
|
+
|
|
276
|
+
# Extract articles from a Wikipedia category
#
# Fetches the category's article list from the Wikipedia API (with a
# preview and an interactive confirmation step), locates the articles in
# the multistream index, downloads only the streams that contain them,
# and writes the formatted articles through an OutputWriter. Large
# batches spanning multiple streams are processed in parallel.
#
# @param opts [Hash] parsed CLI options; reads :lang, :from_category,
#   :depth, :cache_dir, :dry_run, :yes, :update_cache, :output_dir,
#   :format, :file_size and the flags used by build_extraction_config
# @return [Integer] EXIT_SUCCESS, EXIT_PARTIAL (some articles missing
#   from the dump), or EXIT_ERROR (fetch failure / nothing found)
def extract_category_articles(opts)
  lang = opts[:lang]
  category = opts[:from_category]
  max_depth = opts[:depth]
  cache_dir = opts[:cache_dir]
  dry_run = opts[:dry_run]
  skip_confirm = opts[:yes]
  total_steps = 6
  start_time = Time.now

  # Mode banner
  print_mode_banner("Category Extraction", {
    "Language" => lang,
    "Category" => category,
    "Depth" => max_depth,
    "Output" => opts[:output_dir]
  })

  # Get config values
  app_config = Wp2txt::CLI.config

  # Create category fetcher
  fetcher = Wp2txt::CategoryFetcher.new(
    lang, category,
    max_depth: max_depth,
    cache_expiry_days: app_config.category_expiry_days
  )

  # Step 1: Fetch preview
  print_header("Scanning category", step: 1, total_steps: total_steps)
  spinner = create_spinner("Fetching category information...")
  spinner.auto_spin

  begin
    preview = fetcher.fetch_preview
    spinner.success(pastel.green("Done!"))
  rescue Net::OpenTimeout, Net::ReadTimeout, SocketError, Errno::ECONNREFUSED, OpenSSL::SSL::SSLError, JSON::ParserError, IOError => e
    # Network / parse failures are reported, not raised, so the CLI can
    # exit with a clean error code.
    spinner.error(pastel.red("Failed!"))
    print_error("Error fetching category: #{e.message}")
    return EXIT_ERROR
  end

  # Display preview
  print_subheader("Category Preview")
  print_info("Category", preview[:category])
  print_info("Depth", preview[:depth].to_s)
  puts

  if preview[:subcategories] && !preview[:subcategories].empty?
    puts pastel.dim("Categories scanned:")
    preview[:subcategories].each do |subcat|
      print_list_item("#{subcat[:name]} (#{subcat[:article_count]} articles)")
    end
    puts
  end

  puts pastel.dim("Summary:")
  print_info("Subcategories", (preview[:total_subcategories] || 0).to_s, indent: 1)
  print_info("Total articles", pastel.bold(preview[:total_articles].to_s), indent: 1)
  puts

  # Check if there are any articles
  if preview[:total_articles].zero?
    print_warning("No articles found in this category.")
    return EXIT_SUCCESS # Not an error, just empty category
  end

  # Warn about large extractions
  if preview[:total_articles] > 1000
    print_warning("Large category with #{preview[:total_articles]} articles.")
    puts pastel.yellow(" Extraction may take a long time and require significant disk space.")
    puts
  end

  # Show cache status before confirmation
  temp_manager = Wp2txt::DumpManager.new(
    lang,
    cache_dir: cache_dir,
    dump_expiry_days: app_config.dump_expiry_days
  )
  cache_stale = show_download_estimate(temp_manager, app_config)
  force_update = opts[:update_cache]

  # Dry run mode - exit here
  if dry_run
    print_info_message("Dry run mode - no articles will be extracted.")
    return EXIT_SUCCESS
  end

  # Confirmation prompt (refuse to block forever when stdin is not a TTY)
  unless skip_confirm
    unless $stdin.tty?
      print_error("Interactive confirmation required.")
      puts pastel.red(" Use --yes to skip confirmation when running non-interactively.")
      return EXIT_ERROR
    end

    unless confirm?("Proceed with extraction?")
      puts "Extraction cancelled."
      return EXIT_SUCCESS # User chose to cancel
    end
  end

  # Step 2: Fetch full article list
  print_header("Fetching articles from API", step: 2, total_steps: total_steps)
  spinner = create_spinner("Fetching article list...")
  spinner.auto_spin

  begin
    article_titles = fetcher.fetch_articles
    spinner.success(pastel.green("#{article_titles.size} articles"))
  rescue Net::OpenTimeout, Net::ReadTimeout, SocketError, Errno::ECONNREFUSED, OpenSSL::SSL::SSLError, JSON::ParserError, IOError => e
    spinner.error(pastel.red("Failed!"))
    print_error("Error fetching articles: #{e.message}")
    return EXIT_ERROR
  end

  if article_titles.empty?
    print_warning("No articles to extract.")
    return EXIT_SUCCESS # Not an error, just empty result
  end

  # Create dump manager
  manager = Wp2txt::DumpManager.new(
    lang,
    cache_dir: cache_dir,
    dump_expiry_days: app_config.dump_expiry_days
  )

  # Step 3: Download index
  print_header("Downloading index", step: 3, total_steps: total_steps)
  index_path = manager.download_index(force: force_update)

  # Step 4: Load index and locate articles
  print_header("Locating articles in dump", step: 4, total_steps: total_steps)

  # Use SQLite cache for fast repeated access
  index = Wp2txt::MultistreamIndex.new(index_path, cache_dir: cache_dir)

  if index.loaded_from_cache?
    puts " Index loaded from cache (#{index.size} entries)"
  else
    puts " Index parsed (#{index.size} entries)"
  end

  # Find articles in index
  found_articles = []
  not_found = []

  article_titles.each do |title|
    entry = index.find_by_title(title)
    if entry
      found_articles << entry
    else
      not_found << title
    end
  end

  print_list_item("Found in dump: #{found_articles.size}", status: :success)
  print_list_item("Not in dump: #{not_found.size}", status: not_found.any? ? :warning : :success) if not_found.any?

  if found_articles.empty?
    print_error("No articles found in dump. The dump may be out of date.")
    return EXIT_ERROR
  end

  # Step 5: Download multistream (only the unique stream offsets needed)
  streams_needed = found_articles.map { |e| e[:offset] }.uniq.sort
  print_header("Downloading data (#{streams_needed.size} streams)", step: 5, total_steps: total_steps)
  multistream_path = download_partial_streams(manager, index, streams_needed, force: force_update)

  # Create multistream reader, reusing the existing index (avoids double parsing)
  reader = Wp2txt::MultistreamReader.new(multistream_path, index)

  # Build config
  format = opts[:format].to_s.downcase.to_sym
  config = build_extraction_config(opts, format)

  # Create output writer
  base_name = "#{lang}wiki_#{sanitize_filename(category)}"
  writer = OutputWriter.new(
    output_dir: opts[:output_dir],
    base_name: base_name,
    format: format,
    file_size_mb: opts[:file_size]
  )

  # Step 6: Extract and process articles
  print_header("Extracting articles", step: 6, total_steps: total_steps)
  total_count = found_articles.size
  bar = create_progress_bar(" Processing", total_count)

  extracted_count = 0
  extraction_start = Time.now

  # Use parallel extraction for large batches (>50 articles across multiple streams)
  streams_count = found_articles.map { |e| e[:offset] }.uniq.size
  use_parallel = total_count > 50 && streams_count > 1

  if use_parallel
    # Parallel extraction: process streams concurrently
    num_procs = [streams_count, 4].min # Cap at 4 processes
    pages = reader.each_article_parallel(found_articles, num_processes: num_procs).to_a

    pages.each do |page|
      article = Article.new(page[:text], page[:title], !config[:marker])
      result = format_article(article, config)
      writer.write(result)
      extracted_count += 1
      bar.advance
    end
  else
    # Sequential extraction for small batches
    found_articles.each do |entry|
      title = entry[:title]
      page = reader.extract_article(title)

      if page
        article = Article.new(page[:text], page[:title], !config[:marker])
        result = format_article(article, config)
        writer.write(result)
        extracted_count += 1
      end

      bar.advance
    end
  end

  bar.finish
  extraction_time = Time.now - extraction_start

  # Close output
  output_files = writer.close

  # Summary
  total_time = Time.now - start_time
  status = not_found.empty? ? :success : :warning

  print_summary("Extraction Complete", {
    "Articles extracted" => "#{extracted_count}/#{article_titles.size}",
    "Output files" => output_files.size.to_s,
    "Extraction time" => format_duration(extraction_time),
    "Total time" => format_duration(total_time)
  }, status: status)

  if not_found.any?
    puts unless quiet?
    if not_found.size <= 10
      print_warning("Not found in dump (#{not_found.size}):")
      not_found.each { |t| print_list_item(t, status: :warning) }
    else
      print_warning("#{not_found.size} articles not found (may be newer than dump)")
    end
  end

  puts unless quiet?
  puts pastel.dim("Output files:") unless quiet?
  output_files.each { |f| print_list_item(f, status: :success) }

  # Return appropriate exit code
  not_found.empty? ? EXIT_SUCCESS : EXIT_PARTIAL
end
|
|
539
|
+
|
|
540
|
+
# Make a category name safe to embed in an output filename: replace
# filesystem-reserved characters and whitespace runs with underscores,
# then truncate to 50 characters.
#
# @param name [String] raw category name
# @return [String] sanitized name, at most 50 characters
def sanitize_filename(name)
  cleaned = name.gsub(%r{[/\\:*?"<>|]}, "_")
  cleaned.gsub(/\s+/, "_")[0, 50]
end
|
|
544
|
+
end
|
|
545
|
+
end
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "find"
|
|
4
|
+
|
|
5
|
+
module Wp2txt
|
|
6
|
+
# File operation utilities
|
|
7
|
+
|
|
8
|
+
# Recursively gather paths under +str+ whose full path matches +regex+
# (every path matches when no regex is given).
#
# @param str [String] root file or directory to walk
# @param regex [Regexp, nil] optional filter applied to each path
# @return [Array<String>] matching paths, sorted
def collect_files(str, regex = nil)
  pattern = regex || //
  matches = []
  Find.find(str) { |path| matches << path if pattern =~ path }
  matches.sort
end
|
|
17
|
+
|
|
18
|
+
# Modify a file using block/yield mechanism
#
# Reads the whole file, yields its contents to the caller's block, and
# replaces the file with the block's return value (the original content
# is kept when the block returns nil). The swap goes through a tempfile
# in the same directory plus a ".bak" rename so a failure mid-write does
# not leave a truncated file behind.
#
# @param file_path [String] path of the file to rewrite
# @param backup [Boolean] keep the ".bak" copy of the original when true
def file_mod(file_path, backup = false)
  str = File.read(file_path)
  newstr = yield(str)
  str = newstr unless newstr.nil?

  require "tempfile"
  dir = File.dirname(file_path)
  # Tempfile is created in the target directory so File.rename stays on
  # the same filesystem (a cross-device rename would fail).
  temp = Tempfile.new(["wp2txt_", File.extname(file_path)], dir)
  begin
    temp.write(str)
    temp.close
    File.rename(file_path, file_path + ".bak")
    File.rename(temp.path, file_path)
    File.unlink(file_path + ".bak") unless backup
  rescue StandardError
    # Best-effort cleanup of the tempfile, then re-raise the original error.
    temp.close! rescue nil # rubocop:disable Style/RescueModifier
    raise
  end
end
|
|
38
|
+
|
|
39
|
+
# Yield every regular file under +dir_path+ (recursively) to the given
# block. A plain file path is yielded directly; any other input is a
# no-op.
#
# @param dir_path [String] directory or file path
def batch_file_mod(dir_path)
  if FileTest.file?(dir_path)
    yield dir_path
  elsif FileTest.directory?(dir_path)
    collect_files(dir_path).each do |entry|
      yield entry if FileTest.file?(entry)
    end
  end
end
|
|
49
|
+
|
|
50
|
+
# Normalize path separators for the current platform.
#
# On Windows builds (mswin/mingw/cygwin) forward slashes become
# backslashes; elsewhere backslashes become forward slashes. Arrays are
# converted element-wise. Any other input type returns nil (unchanged
# from the historical behavior).
#
# @param input [String, Array<String>] path or list of paths
# @return [String, Array<String>, nil] converted value
def correct_separator(input)
  case input
  when String
    # Use tr instead of gsub for simple character replacement (faster).
    # BUG FIX: the old check `RUBY_PLATFORM.index("win32")` missed
    # mingw/cygwin rubies (e.g. "x64-mingw32" does not contain "win32"),
    # so RubyInstaller users silently got POSIX separators. Match the
    # full family of Windows platform strings instead.
    if RUBY_PLATFORM.match?(/mswin|mingw|cygwin/)
      input.tr("/", "\\")
    else
      input.tr("\\", "/")
    end
  when Array
    input.map { |item| correct_separator(item) }
  end
end
|
|
64
|
+
|
|
65
|
+
# Rename generated files so their numeric suffixes are zero-padded to a
# uniform width and carry the given extension,
# e.g. ["out-1", "out-10"] -> "out-01.txt", "out-10.txt".
#
# @param files [Array<String>] paths ending in "-<number>"
# @param ext [String] extension to append (without the dot)
# @return [true]
def rename(files, ext = "txt")
  # First pass: number of digits needed to name the last file generated.
  # BUG FIX: the old code updated maxwidth inside the rename loop, so
  # files early in the list could be padded narrower than later ones
  # (e.g. "out-1.txt" next to "out-10.txt"). Compute the width up front
  # so every file gets identical padding.
  maxwidth = files.map { |f| f.slice(/-(\d+)\z/, 1).to_s.length }.max || 0

  files.each do |f|
    newname = f.sub(/-(\d+)\z/) do
      "-" + format("%0#{maxwidth}d", Regexp.last_match(1).to_i)
    end
    File.rename(f, newname + ".#{ext}")
  end
  true
end
|
|
79
|
+
|
|
80
|
+
# Convert a number of seconds into an "HH:MM:SS" string.
# Returns "--:--:--" when the argument is nil (elapsed time unknown).
#
# @param int [Integer, nil] seconds
# @return [String] formatted duration
def sec_to_str(int)
  return "--:--:--" unless int

  hours, remainder = int.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  format("%02d:%02d:%02d", hours, minutes, seconds)
end
|
|
91
|
+
end
|