wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
data/spec/config_spec.rb
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "spec_helper"
|
|
4
|
+
require_relative "../lib/wp2txt/config"
|
|
5
|
+
require "tmpdir"
|
|
6
|
+
require "fileutils"
|
|
7
|
+
|
|
8
|
+
RSpec.describe Wp2txt::Config do
|
|
9
|
+
let(:tmpdir) { Dir.mktmpdir("wp2txt_config_test_") }
|
|
10
|
+
let(:config_path) { File.join(tmpdir, "config.yml") }
|
|
11
|
+
|
|
12
|
+
after do
|
|
13
|
+
FileUtils.rm_rf(tmpdir)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
describe ".default_path" do
|
|
17
|
+
it "returns path in home directory" do
|
|
18
|
+
expect(described_class.default_path).to include(".wp2txt")
|
|
19
|
+
expect(described_class.default_path).to end_with("config.yml")
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
describe ".load" do
|
|
24
|
+
context "when config file does not exist" do
|
|
25
|
+
it "returns default configuration" do
|
|
26
|
+
config = described_class.load(config_path)
|
|
27
|
+
|
|
28
|
+
expect(config.dump_expiry_days).to eq 30
|
|
29
|
+
expect(config.category_expiry_days).to eq 7
|
|
30
|
+
expect(config.cache_directory).to eq Wp2txt::Config::DEFAULT_CACHE_DIR
|
|
31
|
+
expect(config.default_format).to eq "text"
|
|
32
|
+
expect(config.default_depth).to eq 0
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
context "when config file exists" do
|
|
37
|
+
it "loads settings from file" do
|
|
38
|
+
File.write(config_path, <<~YAML)
|
|
39
|
+
cache:
|
|
40
|
+
dump_expiry_days: 60
|
|
41
|
+
category_expiry_days: 14
|
|
42
|
+
directory: /custom/cache
|
|
43
|
+
defaults:
|
|
44
|
+
format: json
|
|
45
|
+
depth: 2
|
|
46
|
+
YAML
|
|
47
|
+
|
|
48
|
+
config = described_class.load(config_path)
|
|
49
|
+
|
|
50
|
+
expect(config.dump_expiry_days).to eq 60
|
|
51
|
+
expect(config.category_expiry_days).to eq 14
|
|
52
|
+
expect(config.cache_directory).to eq "/custom/cache"
|
|
53
|
+
expect(config.default_format).to eq "json"
|
|
54
|
+
expect(config.default_depth).to eq 2
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
it "uses defaults for missing keys" do
|
|
58
|
+
File.write(config_path, <<~YAML)
|
|
59
|
+
cache:
|
|
60
|
+
dump_expiry_days: 45
|
|
61
|
+
YAML
|
|
62
|
+
|
|
63
|
+
config = described_class.load(config_path)
|
|
64
|
+
|
|
65
|
+
expect(config.dump_expiry_days).to eq 45
|
|
66
|
+
expect(config.category_expiry_days).to eq 7 # default
|
|
67
|
+
expect(config.default_format).to eq "text" # default
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
it "handles empty file" do
|
|
71
|
+
File.write(config_path, "")
|
|
72
|
+
|
|
73
|
+
config = described_class.load(config_path)
|
|
74
|
+
|
|
75
|
+
expect(config.dump_expiry_days).to eq 30
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
it "handles malformed YAML gracefully" do
|
|
79
|
+
File.write(config_path, "invalid: yaml: content: [")
|
|
80
|
+
|
|
81
|
+
config = described_class.load(config_path)
|
|
82
|
+
|
|
83
|
+
# Should return defaults on parse error
|
|
84
|
+
expect(config.dump_expiry_days).to eq 30
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
describe "#save" do
|
|
90
|
+
it "writes configuration to file" do
|
|
91
|
+
config = described_class.new(
|
|
92
|
+
dump_expiry_days: 45,
|
|
93
|
+
category_expiry_days: 10,
|
|
94
|
+
cache_directory: "/my/cache",
|
|
95
|
+
default_format: "json",
|
|
96
|
+
default_depth: 1
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
config.save(config_path)
|
|
100
|
+
|
|
101
|
+
expect(File.exist?(config_path)).to be true
|
|
102
|
+
loaded = described_class.load(config_path)
|
|
103
|
+
expect(loaded.dump_expiry_days).to eq 45
|
|
104
|
+
expect(loaded.category_expiry_days).to eq 10
|
|
105
|
+
expect(loaded.cache_directory).to eq "/my/cache"
|
|
106
|
+
expect(loaded.default_format).to eq "json"
|
|
107
|
+
expect(loaded.default_depth).to eq 1
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
it "creates parent directories if needed" do
|
|
111
|
+
nested_path = File.join(tmpdir, "nested", "dir", "config.yml")
|
|
112
|
+
config = described_class.new
|
|
113
|
+
|
|
114
|
+
config.save(nested_path)
|
|
115
|
+
|
|
116
|
+
expect(File.exist?(nested_path)).to be true
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
describe ".create_default" do
|
|
121
|
+
it "creates a config file with default values" do
|
|
122
|
+
described_class.create_default(config_path)
|
|
123
|
+
|
|
124
|
+
expect(File.exist?(config_path)).to be true
|
|
125
|
+
content = File.read(config_path)
|
|
126
|
+
expect(content).to include("dump_expiry_days: 30")
|
|
127
|
+
expect(content).to include("category_expiry_days: 7")
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
it "does not overwrite existing file" do
|
|
131
|
+
File.write(config_path, "custom: value")
|
|
132
|
+
|
|
133
|
+
result = described_class.create_default(config_path)
|
|
134
|
+
|
|
135
|
+
expect(result).to be false
|
|
136
|
+
expect(File.read(config_path)).to eq "custom: value"
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
it "can force overwrite with force option" do
|
|
140
|
+
File.write(config_path, "custom: value")
|
|
141
|
+
|
|
142
|
+
result = described_class.create_default(config_path, force: true)
|
|
143
|
+
|
|
144
|
+
expect(result).to be true
|
|
145
|
+
expect(File.read(config_path)).to include("dump_expiry_days: 30")
|
|
146
|
+
end
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
describe "#to_h" do
|
|
150
|
+
it "returns hash representation" do
|
|
151
|
+
config = described_class.new(dump_expiry_days: 45)
|
|
152
|
+
|
|
153
|
+
hash = config.to_h
|
|
154
|
+
|
|
155
|
+
expect(hash).to be_a(Hash)
|
|
156
|
+
expect(hash[:cache][:dump_expiry_days]).to eq 45
|
|
157
|
+
expect(hash[:defaults][:format]).to eq "text"
|
|
158
|
+
end
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
describe "validation" do
|
|
162
|
+
it "clamps dump_expiry_days to valid range" do
|
|
163
|
+
config = described_class.new(dump_expiry_days: -5)
|
|
164
|
+
expect(config.dump_expiry_days).to eq 1
|
|
165
|
+
|
|
166
|
+
config = described_class.new(dump_expiry_days: 400)
|
|
167
|
+
expect(config.dump_expiry_days).to eq 365
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
it "clamps category_expiry_days to valid range" do
|
|
171
|
+
config = described_class.new(category_expiry_days: 0)
|
|
172
|
+
expect(config.category_expiry_days).to eq 1
|
|
173
|
+
|
|
174
|
+
config = described_class.new(category_expiry_days: 100)
|
|
175
|
+
expect(config.category_expiry_days).to eq 90
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
it "clamps default_depth to valid range" do
|
|
179
|
+
config = described_class.new(default_depth: -1)
|
|
180
|
+
expect(config.default_depth).to eq 0
|
|
181
|
+
|
|
182
|
+
config = described_class.new(default_depth: 20)
|
|
183
|
+
expect(config.default_depth).to eq 10
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
it "validates default_format" do
|
|
187
|
+
config = described_class.new(default_format: "invalid")
|
|
188
|
+
expect(config.default_format).to eq "text"
|
|
189
|
+
|
|
190
|
+
config = described_class.new(default_format: "json")
|
|
191
|
+
expect(config.default_format).to eq "json"
|
|
192
|
+
end
|
|
193
|
+
end
|
|
194
|
+
end
|
|
data/spec/constants_spec.rb
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
require "tempfile"
|
|
5
|
+
|
|
6
|
+
RSpec.describe "Wp2txt Constants" do
|
|
7
|
+
describe "Time Constants" do
|
|
8
|
+
it "defines SECONDS_PER_DAY" do
|
|
9
|
+
expect(Wp2txt::SECONDS_PER_DAY).to eq(86_400)
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
it "defines SECONDS_PER_HOUR" do
|
|
13
|
+
expect(Wp2txt::SECONDS_PER_HOUR).to eq(3_600)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
it "defines SECONDS_PER_MINUTE" do
|
|
17
|
+
expect(Wp2txt::SECONDS_PER_MINUTE).to eq(60)
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
describe "Cache Settings" do
|
|
22
|
+
it "defines DEFAULT_DUMP_EXPIRY_DAYS" do
|
|
23
|
+
expect(Wp2txt::DEFAULT_DUMP_EXPIRY_DAYS).to eq(30)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
it "defines DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS" do
|
|
27
|
+
expect(Wp2txt::DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS).to eq(7)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
describe "Network Settings" do
|
|
33
|
+
it "defines DEFAULT_HTTP_TIMEOUT" do
|
|
34
|
+
expect(Wp2txt::DEFAULT_HTTP_TIMEOUT).to eq(30)
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
it "defines DEFAULT_PROGRESS_INTERVAL" do
|
|
38
|
+
expect(Wp2txt::DEFAULT_PROGRESS_INTERVAL).to eq(10)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
it "defines INDEX_PROGRESS_THRESHOLD" do
|
|
42
|
+
expect(Wp2txt::INDEX_PROGRESS_THRESHOLD).to eq(500_000)
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
it "defines DEFAULT_TOP_N_SECTIONS" do
|
|
46
|
+
expect(Wp2txt::DEFAULT_TOP_N_SECTIONS).to eq(50)
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
it "defines RESUME_METADATA_MAX_AGE_DAYS" do
|
|
50
|
+
expect(Wp2txt::RESUME_METADATA_MAX_AGE_DAYS).to eq(7)
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
it "defines MAX_HTTP_RETRIES" do
|
|
54
|
+
expect(Wp2txt::MAX_HTTP_RETRIES).to eq(3)
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
describe "Processing Limits" do
|
|
59
|
+
it "defines MAX_NESTING_ITERATIONS" do
|
|
60
|
+
expect(Wp2txt::MAX_NESTING_ITERATIONS).to eq(50_000)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
it "defines DEFAULT_BUFFER_SIZE" do
|
|
64
|
+
expect(Wp2txt::DEFAULT_BUFFER_SIZE).to eq(10_485_760) # 10 MB
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
describe "File Size Units" do
|
|
69
|
+
it "defines BYTES_PER_KB" do
|
|
70
|
+
expect(Wp2txt::BYTES_PER_KB).to eq(1024)
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
it "defines BYTES_PER_MB" do
|
|
74
|
+
expect(Wp2txt::BYTES_PER_MB).to eq(1024 * 1024)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
it "defines BYTES_PER_GB" do
|
|
78
|
+
expect(Wp2txt::BYTES_PER_GB).to eq(1024 * 1024 * 1024)
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
describe ".days_to_seconds" do
|
|
83
|
+
it "converts days to seconds" do
|
|
84
|
+
expect(Wp2txt.days_to_seconds(1)).to eq(86_400)
|
|
85
|
+
expect(Wp2txt.days_to_seconds(7)).to eq(7 * 86_400)
|
|
86
|
+
expect(Wp2txt.days_to_seconds(0.5)).to eq(43_200)
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
describe ".file_fresh?" do
|
|
91
|
+
let(:temp_file) { Tempfile.new("test_file") }
|
|
92
|
+
|
|
93
|
+
after { temp_file.unlink }
|
|
94
|
+
|
|
95
|
+
it "returns true for recently created file" do
|
|
96
|
+
expect(Wp2txt.file_fresh?(temp_file.path, 1)).to be true
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
it "returns false for non-existent file" do
|
|
100
|
+
expect(Wp2txt.file_fresh?("/nonexistent/path", 1)).to be false
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
describe ".file_age_days" do
|
|
105
|
+
let(:temp_file) { Tempfile.new("test_file") }
|
|
106
|
+
|
|
107
|
+
after { temp_file.unlink }
|
|
108
|
+
|
|
109
|
+
it "returns age in days for existing file" do
|
|
110
|
+
age = Wp2txt.file_age_days(temp_file.path)
|
|
111
|
+
expect(age).to be_a(Float)
|
|
112
|
+
expect(age).to be >= 0
|
|
113
|
+
expect(age).to be < 1 # File just created
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
it "returns nil for non-existent file" do
|
|
117
|
+
expect(Wp2txt.file_age_days("/nonexistent/path")).to be_nil
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
describe ".format_file_size" do
|
|
122
|
+
it "formats bytes" do
|
|
123
|
+
expect(Wp2txt.format_file_size(500)).to eq("500 B")
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
it "formats kilobytes" do
|
|
127
|
+
expect(Wp2txt.format_file_size(2048)).to eq("2.0 KB")
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
it "formats megabytes" do
|
|
131
|
+
expect(Wp2txt.format_file_size(5_242_880)).to eq("5.0 MB")
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
it "formats gigabytes" do
|
|
135
|
+
expect(Wp2txt.format_file_size(2_147_483_648)).to eq("2.0 GB")
|
|
136
|
+
end
|
|
137
|
+
end
|
|
138
|
+
end
|
|
data/spec/file_utils_spec.rb
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
require "tempfile"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
|
|
7
|
+
RSpec.describe "Wp2txt FileUtils" do
|
|
8
|
+
include Wp2txt
|
|
9
|
+
|
|
10
|
+
describe "collect_files" do
|
|
11
|
+
let(:temp_dir) { Dir.mktmpdir }
|
|
12
|
+
|
|
13
|
+
after { FileUtils.remove_entry(temp_dir) }
|
|
14
|
+
|
|
15
|
+
it "collects all files in a directory" do
|
|
16
|
+
# Create test files
|
|
17
|
+
File.write(File.join(temp_dir, "file1.txt"), "content1")
|
|
18
|
+
File.write(File.join(temp_dir, "file2.txt"), "content2")
|
|
19
|
+
|
|
20
|
+
files = collect_files(temp_dir)
|
|
21
|
+
expect(files).to include(a_string_ending_with("file1.txt"))
|
|
22
|
+
expect(files).to include(a_string_ending_with("file2.txt"))
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
it "filters files by regex" do
|
|
26
|
+
File.write(File.join(temp_dir, "file1.txt"), "content1")
|
|
27
|
+
File.write(File.join(temp_dir, "file2.rb"), "content2")
|
|
28
|
+
|
|
29
|
+
files = collect_files(temp_dir, /\.txt$/)
|
|
30
|
+
expect(files).to include(a_string_ending_with("file1.txt"))
|
|
31
|
+
expect(files).not_to include(a_string_ending_with("file2.rb"))
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
it "returns sorted list" do
|
|
35
|
+
File.write(File.join(temp_dir, "z_file.txt"), "")
|
|
36
|
+
File.write(File.join(temp_dir, "a_file.txt"), "")
|
|
37
|
+
|
|
38
|
+
files = collect_files(temp_dir, /\.txt$/)
|
|
39
|
+
txt_files = files.select { |f| f.end_with?(".txt") }
|
|
40
|
+
expect(txt_files).to eq(txt_files.sort)
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
describe "correct_separator" do
|
|
45
|
+
it "converts backslashes to forward slashes on non-Windows" do
|
|
46
|
+
skip "Only runs on non-Windows" if RUBY_PLATFORM.index("win32")
|
|
47
|
+
expect(correct_separator("path\\to\\file")).to eq("path/to/file")
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
it "handles arrays of paths" do
|
|
51
|
+
skip "Only runs on non-Windows" if RUBY_PLATFORM.index("win32")
|
|
52
|
+
result = correct_separator(["path\\to\\file1", "path\\to\\file2"])
|
|
53
|
+
expect(result).to eq(["path/to/file1", "path/to/file2"])
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
it "returns nil for nil input" do
|
|
57
|
+
expect(correct_separator(nil)).to be_nil
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
describe "sec_to_str" do
|
|
62
|
+
it "converts seconds to HH:MM:SS format" do
|
|
63
|
+
expect(sec_to_str(3661)).to eq("01:01:01")
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
it "handles zero" do
|
|
67
|
+
expect(sec_to_str(0)).to eq("00:00:00")
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
it "handles large values" do
|
|
71
|
+
expect(sec_to_str(86400)).to eq("24:00:00") # 1 day
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
it "handles nil input" do
|
|
75
|
+
expect(sec_to_str(nil)).to eq("--:--:--")
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
it "formats with leading zeros" do
|
|
79
|
+
expect(sec_to_str(61)).to eq("00:01:01")
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
describe "file_mod" do
|
|
84
|
+
let(:temp_file) { Tempfile.new("test_file") }
|
|
85
|
+
|
|
86
|
+
after do
|
|
87
|
+
temp_file.close
|
|
88
|
+
temp_file.unlink
|
|
89
|
+
File.unlink("temp") if File.exist?("temp")
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
it "modifies file content using block" do
|
|
93
|
+
temp_file.write("original content")
|
|
94
|
+
temp_file.close
|
|
95
|
+
|
|
96
|
+
file_mod(temp_file.path) do |content|
|
|
97
|
+
content.upcase
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
expect(File.read(temp_file.path)).to eq("ORIGINAL CONTENT")
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
it "keeps backup when requested" do
|
|
104
|
+
temp_file.write("original")
|
|
105
|
+
temp_file.close
|
|
106
|
+
|
|
107
|
+
file_mod(temp_file.path, true) do |content|
|
|
108
|
+
"modified"
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
expect(File.exist?(temp_file.path + ".bak")).to be true
|
|
112
|
+
File.unlink(temp_file.path + ".bak")
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
describe "batch_file_mod" do
|
|
117
|
+
let(:temp_dir) { Dir.mktmpdir }
|
|
118
|
+
|
|
119
|
+
after { FileUtils.remove_entry(temp_dir) }
|
|
120
|
+
|
|
121
|
+
it "yields each file in directory" do
|
|
122
|
+
File.write(File.join(temp_dir, "file1.txt"), "")
|
|
123
|
+
File.write(File.join(temp_dir, "file2.txt"), "")
|
|
124
|
+
|
|
125
|
+
files_processed = []
|
|
126
|
+
batch_file_mod(temp_dir) do |file|
|
|
127
|
+
files_processed << File.basename(file)
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
expect(files_processed).to include("file1.txt", "file2.txt")
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
it "yields single file if path is a file" do
|
|
134
|
+
file_path = File.join(temp_dir, "single.txt")
|
|
135
|
+
File.write(file_path, "")
|
|
136
|
+
|
|
137
|
+
files_processed = []
|
|
138
|
+
batch_file_mod(file_path) do |file|
|
|
139
|
+
files_processed << file
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
expect(files_processed).to eq([file_path])
|
|
143
|
+
end
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
describe "rename" do
|
|
147
|
+
let(:temp_dir) { Dir.mktmpdir }
|
|
148
|
+
|
|
149
|
+
after { FileUtils.remove_entry(temp_dir) }
|
|
150
|
+
|
|
151
|
+
it "renames files with extension" do
|
|
152
|
+
file1 = File.join(temp_dir, "test-1")
|
|
153
|
+
file2 = File.join(temp_dir, "test-2")
|
|
154
|
+
File.write(file1, "")
|
|
155
|
+
File.write(file2, "")
|
|
156
|
+
|
|
157
|
+
rename([file1, file2], "txt")
|
|
158
|
+
|
|
159
|
+
expect(File.exist?(File.join(temp_dir, "test-1.txt"))).to be true
|
|
160
|
+
expect(File.exist?(File.join(temp_dir, "test-2.txt"))).to be true
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
it "returns true on success" do
|
|
164
|
+
file1 = File.join(temp_dir, "test-1")
|
|
165
|
+
File.write(file1, "")
|
|
166
|
+
|
|
167
|
+
expect(rename([file1])).to be true
|
|
168
|
+
end
|
|
169
|
+
end
|
|
170
|
+
end
|
|
data/spec/fixtures/samples.rb
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Test samples for multilingual content
|
|
4
|
+
module Wp2txt
|
|
5
|
+
module TestSamples
|
|
6
|
+
ENGLISH_ARTICLE = <<~WIKI
|
|
7
|
+
'''Test Article''' is an [[English language|English]] article.
|
|
8
|
+
== Section ==
|
|
9
|
+
[[File:Example.jpg|thumb|A description]]
|
|
10
|
+
[[Category:Tests]]
|
|
11
|
+
[[Category:Examples]]
|
|
12
|
+
WIKI
|
|
13
|
+
|
|
14
|
+
JAPANESE_ARTICLE = <<~WIKI
|
|
15
|
+
'''テスト記事'''は[[日本語]]の記事です。
|
|
16
|
+
== セクション ==
|
|
17
|
+
[[ファイル:Example.jpg|thumb|説明文]]
|
|
18
|
+
[[カテゴリ:テスト]]
|
|
19
|
+
WIKI
|
|
20
|
+
|
|
21
|
+
GERMAN_ARTICLE = <<~WIKI
|
|
22
|
+
'''Testartikel''' ist ein [[Deutsch|deutscher]] Artikel.
|
|
23
|
+
== Abschnitt ==
|
|
24
|
+
[[Datei:Bild.png|miniatur|Beschreibung]]
|
|
25
|
+
[[Kategorie:Test]]
|
|
26
|
+
#WEITERLEITUNG [[Andere Seite]]
|
|
27
|
+
WIKI
|
|
28
|
+
|
|
29
|
+
FRENCH_ARTICLE = <<~WIKI
|
|
30
|
+
'''Article de test''' est un [[Français|article français]].
|
|
31
|
+
== Section ==
|
|
32
|
+
[[Fichier:Image.jpg|vignette|Description]]
|
|
33
|
+
[[Catégorie:Test]]
|
|
34
|
+
#REDIRECTION [[Autre page]]
|
|
35
|
+
WIKI
|
|
36
|
+
|
|
37
|
+
CHINESE_ARTICLE = <<~WIKI
|
|
38
|
+
'''测试文章'''是一个[[中文]]文章。
|
|
39
|
+
== 章节 ==
|
|
40
|
+
[[文件:图片.jpg|缩略图|说明]]
|
|
41
|
+
[[分类:测试]]
|
|
42
|
+
#重定向 [[其他页面]]
|
|
43
|
+
WIKI
|
|
44
|
+
|
|
45
|
+
RUSSIAN_ARTICLE = <<~WIKI
|
|
46
|
+
'''Тестовая статья''' — это [[Русский язык|русская]] статья.
|
|
47
|
+
== Раздел ==
|
|
48
|
+
[[Файл:Изображение.jpg|мини|Описание]]
|
|
49
|
+
[[Категория:Тест]]
|
|
50
|
+
#ПЕРЕНАПРАВЛЕНИЕ [[Другая страница]]
|
|
51
|
+
WIKI
|
|
52
|
+
|
|
53
|
+
KOREAN_ARTICLE = <<~WIKI
|
|
54
|
+
'''테스트 문서'''는 [[한국어]] 문서입니다.
|
|
55
|
+
== 섹션 ==
|
|
56
|
+
[[파일:Example.jpg|섬네일|설명]]
|
|
57
|
+
[[분류:테스트]]
|
|
58
|
+
#넘겨주기 [[다른 문서]]
|
|
59
|
+
WIKI
|
|
60
|
+
|
|
61
|
+
ARABIC_ARTICLE = <<~WIKI
|
|
62
|
+
'''مقالة اختبار''' هي [[اللغة العربية|مقالة عربية]].
|
|
63
|
+
== قسم ==
|
|
64
|
+
[[ملف:صورة.jpg|تصغير|وصف]]
|
|
65
|
+
[[تصنيف:اختبار]]
|
|
66
|
+
#تحويل [[صفحة أخرى]]
|
|
67
|
+
WIKI
|
|
68
|
+
|
|
69
|
+
# Edge cases
|
|
70
|
+
EMOJI_CONTENT = "Text with emoji 😀 and 😀 symbols"
|
|
71
|
+
DEEPLY_NESTED = "{{a|{{b|{{c|{{d|text}}}}}}}}"
|
|
72
|
+
MALFORMED_MARKUP = "[[Unclosed link\n{{Unclosed template"
|
|
73
|
+
|
|
74
|
+
# Complex nested structure
|
|
75
|
+
NESTED_TEMPLATES = <<~WIKI
|
|
76
|
+
{{Infobox person
|
|
77
|
+
|name = Test Person
|
|
78
|
+
|birth_date = {{Birth date|1990|1|15}}
|
|
79
|
+
|occupation = [[Scientist]]
|
|
80
|
+
}}
|
|
81
|
+
WIKI
|
|
82
|
+
|
|
83
|
+
# Table content
|
|
84
|
+
TABLE_CONTENT = <<~WIKI
|
|
85
|
+
{| class="wikitable"
|
|
86
|
+
|-
|
|
87
|
+
! Header 1 !! Header 2
|
|
88
|
+
|-
|
|
89
|
+
| Cell 1 || Cell 2
|
|
90
|
+
|}
|
|
91
|
+
WIKI
|
|
92
|
+
|
|
93
|
+
# Reference content
|
|
94
|
+
REFERENCE_CONTENT = <<~WIKI
|
|
95
|
+
This is text with a reference.<ref>Citation here</ref>
|
|
96
|
+
Another reference.<ref name="test">Named citation</ref>
|
|
97
|
+
WIKI
|
|
98
|
+
|
|
99
|
+
# Multi-line link
|
|
100
|
+
MULTILINE_LINK = <<~WIKI
|
|
101
|
+
[[File:Example.jpg
|
|
102
|
+
|thumb
|
|
103
|
+
|200px
|
|
104
|
+
|A very long caption that spans
|
|
105
|
+
multiple lines]]
|
|
106
|
+
WIKI
|
|
107
|
+
|
|
108
|
+
# === Additional Edge Cases for v2.0.0 ===
|
|
109
|
+
|
|
110
|
+
# Special characters in titles
|
|
111
|
+
SPECIAL_TITLE_ARTICLE = <<~WIKI
|
|
112
|
+
'''C++ (programming language)''' is a [[programming language]].
|
|
113
|
+
'''O'Brien''' was an [[Irish people|Irish]] person.
|
|
114
|
+
'''Rock & Roll''' is a music genre.
|
|
115
|
+
[[Category:Programming languages]]
|
|
116
|
+
WIKI
|
|
117
|
+
|
|
118
|
+
# Very deeply nested templates (10 levels)
|
|
119
|
+
VERY_DEEPLY_NESTED = "{{a|{{b|{{c|{{d|{{e|{{f|{{g|{{h|{{i|{{j|content}}}}}}}}}}}}}}}}}}}}"
|
|
120
|
+
|
|
121
|
+
# Mixed multilingual content with emoji
|
|
122
|
+
MIXED_CONTENT = <<~WIKI
|
|
123
|
+
'''Test''' こんにちは 你好 مرحبا Привет 😀
|
|
124
|
+
== Section セクション ==
|
|
125
|
+
Text with emoji: 😀 💻 ❤
|
|
126
|
+
[[Category:Test]][[カテゴリ:テスト]][[分类:测试]]
|
|
127
|
+
WIKI
|
|
128
|
+
|
|
129
|
+
# Complex wikilinks with pipes and brackets
|
|
130
|
+
COMPLEX_LINKS = <<~WIKI
|
|
131
|
+
[[File:Photo.jpg|thumb|200px|alt=Alt text|Caption with [[nested link]]]]
|
|
132
|
+
[[Article|Display text with '''bold''' and ''italic'']]
|
|
133
|
+
[[Category:Test|Sort key]]
|
|
134
|
+
WIKI
|
|
135
|
+
|
|
136
|
+
# Multiple consecutive templates
|
|
137
|
+
CONSECUTIVE_TEMPLATES = <<~WIKI
|
|
138
|
+
{{Stub}}{{Cleanup}}{{Unreferenced}}
|
|
139
|
+
This article needs work.
|
|
140
|
+
{{Infobox|title=Test}}
|
|
141
|
+
WIKI
|
|
142
|
+
|
|
143
|
+
# HTML entities mixed with character references
|
|
144
|
+
HTML_ENTITIES_MIXED = <<~WIKI
|
|
145
|
+
&lt;tag&gt;&amp;&quot;
|
|
146
|
+
<literal>
|
|
147
|
+
&#x3C;hex&#x3E;
|
|
148
|
+
Japanese: 日本語
|
|
149
|
+
WIKI
|
|
150
|
+
|
|
151
|
+
# Horizontal rules (various lengths)
|
|
152
|
+
HORIZONTAL_RULES = <<~WIKI
|
|
153
|
+
Text before
|
|
154
|
+
----
|
|
155
|
+
Text between
|
|
156
|
+
--------
|
|
157
|
+
Text after
|
|
158
|
+
--
|
|
159
|
+
Not a rule
|
|
160
|
+
---
|
|
161
|
+
Also not a rule
|
|
162
|
+
WIKI
|
|
163
|
+
|
|
164
|
+
# Headings with various formatting
|
|
165
|
+
COMPLEX_HEADINGS = <<~WIKI
|
|
166
|
+
== Simple Heading ==
|
|
167
|
+
=== Heading with [[link]] ===
|
|
168
|
+
==== Heading with '''bold''' ====
|
|
169
|
+
===== Heading with trailing space =====
|
|
170
|
+
== 日本語見出し ==
|
|
171
|
+
WIKI
|
|
172
|
+
|
|
173
|
+
# Redirect variations
|
|
174
|
+
REDIRECT_VARIATIONS = <<~WIKI
|
|
175
|
+
#REDIRECT [[Target]]
|
|
176
|
+
#redirect [[lowercase]]
|
|
177
|
+
#REDIRECT[[no space]]
|
|
178
|
+
#REDIRECT   [[extra spaces]]
|
|
179
|
+
WIKI
|
|
180
|
+
end
|
|
181
|
+
end
|