wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: wp2txt
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: 2.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yoichiro Hasebe
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: bin
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: bundler
|
|
@@ -52,6 +51,34 @@ dependencies:
|
|
|
52
51
|
- - ">="
|
|
53
52
|
- !ruby/object:Gem::Version
|
|
54
53
|
version: '0'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: simplecov
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - ">="
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '0'
|
|
61
|
+
type: :development
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - ">="
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '0'
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: webmock
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - ">="
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: '0'
|
|
75
|
+
type: :development
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - ">="
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '0'
|
|
55
82
|
- !ruby/object:Gem::Dependency
|
|
56
83
|
name: htmlentities
|
|
57
84
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -123,7 +150,7 @@ dependencies:
|
|
|
123
150
|
- !ruby/object:Gem::Version
|
|
124
151
|
version: '0'
|
|
125
152
|
- !ruby/object:Gem::Dependency
|
|
126
|
-
name:
|
|
153
|
+
name: tty-progressbar
|
|
127
154
|
requirement: !ruby/object:Gem::Requirement
|
|
128
155
|
requirements:
|
|
129
156
|
- - ">="
|
|
@@ -150,6 +177,20 @@ dependencies:
|
|
|
150
177
|
- - ">="
|
|
151
178
|
- !ruby/object:Gem::Version
|
|
152
179
|
version: '0'
|
|
180
|
+
- !ruby/object:Gem::Dependency
|
|
181
|
+
name: sqlite3
|
|
182
|
+
requirement: !ruby/object:Gem::Requirement
|
|
183
|
+
requirements:
|
|
184
|
+
- - ">="
|
|
185
|
+
- !ruby/object:Gem::Version
|
|
186
|
+
version: '0'
|
|
187
|
+
type: :runtime
|
|
188
|
+
prerelease: false
|
|
189
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
190
|
+
requirements:
|
|
191
|
+
- - ">="
|
|
192
|
+
- !ruby/object:Gem::Version
|
|
193
|
+
version: '0'
|
|
153
194
|
description: WP2TXT extracts text and category data from Wikipedia dump files (encoded
|
|
154
195
|
in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.
|
|
155
196
|
email:
|
|
@@ -162,38 +203,95 @@ files:
|
|
|
162
203
|
- ".dockerignore"
|
|
163
204
|
- ".github/workflows/ci.yml"
|
|
164
205
|
- ".gitignore"
|
|
165
|
-
- ".rubocop.yml"
|
|
166
206
|
- ".solargraph.yml"
|
|
207
|
+
- CHANGELOG.md
|
|
208
|
+
- DEVELOPMENT.md
|
|
209
|
+
- DEVELOPMENT_ja.md
|
|
167
210
|
- Dockerfile
|
|
168
211
|
- Gemfile
|
|
169
212
|
- LICENSE
|
|
170
213
|
- README.md
|
|
214
|
+
- README_ja.md
|
|
171
215
|
- Rakefile
|
|
172
216
|
- bin/wp2txt
|
|
173
|
-
- data/output_samples/testdata_en.txt
|
|
174
|
-
- data/output_samples/testdata_en_category.txt
|
|
175
|
-
- data/output_samples/testdata_en_summary.txt
|
|
176
|
-
- data/output_samples/testdata_ja.txt
|
|
177
|
-
- data/output_samples/testdata_ja_category.txt
|
|
178
|
-
- data/output_samples/testdata_ja_summary.txt
|
|
179
|
-
- data/testdata_en.bz2
|
|
180
|
-
- data/testdata_ja.bz2
|
|
181
|
-
- image/screenshot.png
|
|
182
217
|
- image/wp2txt-logo.svg
|
|
183
218
|
- image/wp2txt.svg
|
|
184
219
|
- lib/wp2txt.rb
|
|
185
220
|
- lib/wp2txt/article.rb
|
|
221
|
+
- lib/wp2txt/bz2_validator.rb
|
|
222
|
+
- lib/wp2txt/category_cache.rb
|
|
223
|
+
- lib/wp2txt/cli.rb
|
|
224
|
+
- lib/wp2txt/cli_ui.rb
|
|
225
|
+
- lib/wp2txt/config.rb
|
|
226
|
+
- lib/wp2txt/constants.rb
|
|
227
|
+
- lib/wp2txt/data/html_entities.json
|
|
228
|
+
- lib/wp2txt/data/language_metadata.json
|
|
229
|
+
- lib/wp2txt/data/language_tiers.json
|
|
230
|
+
- lib/wp2txt/data/mediawiki_aliases.json
|
|
231
|
+
- lib/wp2txt/data/template_aliases.json
|
|
232
|
+
- lib/wp2txt/data/wikipedia_entities.json
|
|
233
|
+
- lib/wp2txt/extractor.rb
|
|
234
|
+
- lib/wp2txt/file_utils.rb
|
|
235
|
+
- lib/wp2txt/formatter.rb
|
|
236
|
+
- lib/wp2txt/global_data_cache.rb
|
|
237
|
+
- lib/wp2txt/index_cache.rb
|
|
238
|
+
- lib/wp2txt/magic_words.rb
|
|
239
|
+
- lib/wp2txt/memory_monitor.rb
|
|
240
|
+
- lib/wp2txt/multistream.rb
|
|
241
|
+
- lib/wp2txt/output_writer.rb
|
|
242
|
+
- lib/wp2txt/parser_functions.rb
|
|
243
|
+
- lib/wp2txt/ractor_worker.rb
|
|
186
244
|
- lib/wp2txt/regex.rb
|
|
245
|
+
- lib/wp2txt/section_extractor.rb
|
|
246
|
+
- lib/wp2txt/stream_processor.rb
|
|
247
|
+
- lib/wp2txt/template_expander.rb
|
|
248
|
+
- lib/wp2txt/text_processing.rb
|
|
187
249
|
- lib/wp2txt/utils.rb
|
|
188
250
|
- lib/wp2txt/version.rb
|
|
251
|
+
- scripts/benchmark_regex.rb
|
|
252
|
+
- scripts/fetch_html_entities.rb
|
|
253
|
+
- scripts/fetch_language_metadata.rb
|
|
254
|
+
- scripts/fetch_mediawiki_data.rb
|
|
255
|
+
- scripts/fetch_template_data.rb
|
|
256
|
+
- scripts/profile_memory.rb
|
|
257
|
+
- spec/article_spec.rb
|
|
258
|
+
- spec/auto_download_spec.rb
|
|
259
|
+
- spec/bz2_validator_spec.rb
|
|
260
|
+
- spec/category_cache_spec.rb
|
|
261
|
+
- spec/category_fetcher_spec.rb
|
|
262
|
+
- spec/cleanup_spec.rb
|
|
263
|
+
- spec/cli_options_spec.rb
|
|
264
|
+
- spec/cli_spec.rb
|
|
265
|
+
- spec/config_spec.rb
|
|
266
|
+
- spec/constants_spec.rb
|
|
267
|
+
- spec/file_utils_spec.rb
|
|
268
|
+
- spec/fixtures/samples.rb
|
|
269
|
+
- spec/formatter_sections_spec.rb
|
|
270
|
+
- spec/global_data_cache_spec.rb
|
|
271
|
+
- spec/index_cache_spec.rb
|
|
272
|
+
- spec/integration_spec.rb
|
|
273
|
+
- spec/magic_words_spec.rb
|
|
274
|
+
- spec/markers_spec.rb
|
|
275
|
+
- spec/memory_monitor_spec.rb
|
|
276
|
+
- spec/multistream_spec.rb
|
|
277
|
+
- spec/output_writer_spec.rb
|
|
278
|
+
- spec/parser_functions_spec.rb
|
|
279
|
+
- spec/ractor_worker_spec.rb
|
|
280
|
+
- spec/regex_spec.rb
|
|
281
|
+
- spec/section_extractor_spec.rb
|
|
189
282
|
- spec/spec_helper.rb
|
|
283
|
+
- spec/stream_processor_spec.rb
|
|
284
|
+
- spec/template_data_spec.rb
|
|
285
|
+
- spec/template_expander_spec.rb
|
|
286
|
+
- spec/template_processing_spec.rb
|
|
287
|
+
- spec/text_processing_spec.rb
|
|
190
288
|
- spec/utils_spec.rb
|
|
289
|
+
- spec/wp2txt_spec.rb
|
|
191
290
|
- wp2txt.gemspec
|
|
192
291
|
homepage: https://github.com/yohasebe/wp2txt
|
|
193
292
|
licenses:
|
|
194
293
|
- MIT
|
|
195
294
|
metadata: {}
|
|
196
|
-
post_install_message:
|
|
197
295
|
rdoc_options: []
|
|
198
296
|
require_paths:
|
|
199
297
|
- lib
|
|
@@ -201,18 +299,48 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
201
299
|
requirements:
|
|
202
300
|
- - ">="
|
|
203
301
|
- !ruby/object:Gem::Version
|
|
204
|
-
version: '
|
|
302
|
+
version: '3.0'
|
|
205
303
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
206
304
|
requirements:
|
|
207
305
|
- - ">="
|
|
208
306
|
- !ruby/object:Gem::Version
|
|
209
307
|
version: '0'
|
|
210
308
|
requirements: []
|
|
211
|
-
rubygems_version:
|
|
212
|
-
signing_key:
|
|
309
|
+
rubygems_version: 4.0.3
|
|
213
310
|
specification_version: 4
|
|
214
311
|
summary: A command-line toolkit to extract text content and category data from Wikipedia
|
|
215
312
|
dump files
|
|
216
313
|
test_files:
|
|
314
|
+
- spec/article_spec.rb
|
|
315
|
+
- spec/auto_download_spec.rb
|
|
316
|
+
- spec/bz2_validator_spec.rb
|
|
317
|
+
- spec/category_cache_spec.rb
|
|
318
|
+
- spec/category_fetcher_spec.rb
|
|
319
|
+
- spec/cleanup_spec.rb
|
|
320
|
+
- spec/cli_options_spec.rb
|
|
321
|
+
- spec/cli_spec.rb
|
|
322
|
+
- spec/config_spec.rb
|
|
323
|
+
- spec/constants_spec.rb
|
|
324
|
+
- spec/file_utils_spec.rb
|
|
325
|
+
- spec/fixtures/samples.rb
|
|
326
|
+
- spec/formatter_sections_spec.rb
|
|
327
|
+
- spec/global_data_cache_spec.rb
|
|
328
|
+
- spec/index_cache_spec.rb
|
|
329
|
+
- spec/integration_spec.rb
|
|
330
|
+
- spec/magic_words_spec.rb
|
|
331
|
+
- spec/markers_spec.rb
|
|
332
|
+
- spec/memory_monitor_spec.rb
|
|
333
|
+
- spec/multistream_spec.rb
|
|
334
|
+
- spec/output_writer_spec.rb
|
|
335
|
+
- spec/parser_functions_spec.rb
|
|
336
|
+
- spec/ractor_worker_spec.rb
|
|
337
|
+
- spec/regex_spec.rb
|
|
338
|
+
- spec/section_extractor_spec.rb
|
|
217
339
|
- spec/spec_helper.rb
|
|
340
|
+
- spec/stream_processor_spec.rb
|
|
341
|
+
- spec/template_data_spec.rb
|
|
342
|
+
- spec/template_expander_spec.rb
|
|
343
|
+
- spec/template_processing_spec.rb
|
|
344
|
+
- spec/text_processing_spec.rb
|
|
218
345
|
- spec/utils_spec.rb
|
|
346
|
+
- spec/wp2txt_spec.rb
|
data/.rubocop.yml
DELETED
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
AllCops:
|
|
2
|
-
NewCops: disable
|
|
3
|
-
SuggestExtensions: false
|
|
4
|
-
TargetRubyVersion: 2.6
|
|
5
|
-
|
|
6
|
-
Documentation:
|
|
7
|
-
Enabled: false
|
|
8
|
-
|
|
9
|
-
Naming/AccessorMethodName:
|
|
10
|
-
Enabled: false
|
|
11
|
-
|
|
12
|
-
Naming/VariableNumber:
|
|
13
|
-
Enabled: false
|
|
14
|
-
|
|
15
|
-
Naming/FileName:
|
|
16
|
-
Enabled: false
|
|
17
|
-
|
|
18
|
-
Security/MarshalLoad:
|
|
19
|
-
Enabled: false
|
|
20
|
-
|
|
21
|
-
Security/Open:
|
|
22
|
-
Enabled: false
|
|
23
|
-
|
|
24
|
-
Layout/EndOfLine:
|
|
25
|
-
Enabled: False
|
|
26
|
-
|
|
27
|
-
Style/FormatStringToken:
|
|
28
|
-
Enabled: false
|
|
29
|
-
|
|
30
|
-
Style/ClassVars:
|
|
31
|
-
Enabled: false
|
|
32
|
-
|
|
33
|
-
Style/OptionalBooleanParameter:
|
|
34
|
-
Enabled: false
|
|
35
|
-
|
|
36
|
-
Style/StringConcatenation:
|
|
37
|
-
Enabled: false
|
|
38
|
-
|
|
39
|
-
Style/PerlBackrefs:
|
|
40
|
-
Enabled: false
|
|
41
|
-
|
|
42
|
-
Style/StringLiterals:
|
|
43
|
-
Enabled: false
|
|
44
|
-
|
|
45
|
-
Style/StringLiteralsInInterpolation:
|
|
46
|
-
Enabled: true
|
|
47
|
-
EnforcedStyle: double_quotes
|
|
48
|
-
|
|
49
|
-
Style/WordArray:
|
|
50
|
-
Enabled: false
|
|
51
|
-
|
|
52
|
-
Style/EvalWithLocation:
|
|
53
|
-
Enabled: false
|
|
54
|
-
|
|
55
|
-
Layout/LineLength:
|
|
56
|
-
Max: 400
|
|
57
|
-
|
|
58
|
-
Metrics/MethodLength:
|
|
59
|
-
Max: 200
|
|
60
|
-
|
|
61
|
-
Metrics/BlockLength:
|
|
62
|
-
Max: 200
|
|
63
|
-
|
|
64
|
-
Metrics/AbcSize:
|
|
65
|
-
Max: 200
|
|
66
|
-
|
|
67
|
-
Metrics/PerceivedComplexity:
|
|
68
|
-
Max: 60
|
|
69
|
-
|
|
70
|
-
Metrics/ClassLength:
|
|
71
|
-
Max: 800
|
|
72
|
-
|
|
73
|
-
Metrics/CyclomaticComplexity:
|
|
74
|
-
Max: 60
|
|
75
|
-
|
|
76
|
-
Metrics/ParameterLists:
|
|
77
|
-
Max: 8
|
|
78
|
-
|
|
79
|
-
Metrics/ModuleLength:
|
|
80
|
-
Max: 600
|