wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.3
4
+ version: 2.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2023-05-13 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: bundler
@@ -52,6 +51,34 @@ dependencies:
52
51
  - - ">="
53
52
  - !ruby/object:Gem::Version
54
53
  version: '0'
54
+ - !ruby/object:Gem::Dependency
55
+ name: simplecov
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ type: :development
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
68
+ - !ruby/object:Gem::Dependency
69
+ name: webmock
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: '0'
55
82
  - !ruby/object:Gem::Dependency
56
83
  name: htmlentities
57
84
  requirement: !ruby/object:Gem::Requirement
@@ -123,7 +150,7 @@ dependencies:
123
150
  - !ruby/object:Gem::Version
124
151
  version: '0'
125
152
  - !ruby/object:Gem::Dependency
126
- name: ruby-progressbar
153
+ name: tty-progressbar
127
154
  requirement: !ruby/object:Gem::Requirement
128
155
  requirements:
129
156
  - - ">="
@@ -150,6 +177,20 @@ dependencies:
150
177
  - - ">="
151
178
  - !ruby/object:Gem::Version
152
179
  version: '0'
180
+ - !ruby/object:Gem::Dependency
181
+ name: sqlite3
182
+ requirement: !ruby/object:Gem::Requirement
183
+ requirements:
184
+ - - ">="
185
+ - !ruby/object:Gem::Version
186
+ version: '0'
187
+ type: :runtime
188
+ prerelease: false
189
+ version_requirements: !ruby/object:Gem::Requirement
190
+ requirements:
191
+ - - ">="
192
+ - !ruby/object:Gem::Version
193
+ version: '0'
153
194
  description: WP2TXT extracts text and category data from Wikipedia dump files (encoded
154
195
  in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.
155
196
  email:
@@ -162,38 +203,95 @@ files:
162
203
  - ".dockerignore"
163
204
  - ".github/workflows/ci.yml"
164
205
  - ".gitignore"
165
- - ".rubocop.yml"
166
206
  - ".solargraph.yml"
207
+ - CHANGELOG.md
208
+ - DEVELOPMENT.md
209
+ - DEVELOPMENT_ja.md
167
210
  - Dockerfile
168
211
  - Gemfile
169
212
  - LICENSE
170
213
  - README.md
214
+ - README_ja.md
171
215
  - Rakefile
172
216
  - bin/wp2txt
173
- - data/output_samples/testdata_en.txt
174
- - data/output_samples/testdata_en_category.txt
175
- - data/output_samples/testdata_en_summary.txt
176
- - data/output_samples/testdata_ja.txt
177
- - data/output_samples/testdata_ja_category.txt
178
- - data/output_samples/testdata_ja_summary.txt
179
- - data/testdata_en.bz2
180
- - data/testdata_ja.bz2
181
- - image/screenshot.png
182
217
  - image/wp2txt-logo.svg
183
218
  - image/wp2txt.svg
184
219
  - lib/wp2txt.rb
185
220
  - lib/wp2txt/article.rb
221
+ - lib/wp2txt/bz2_validator.rb
222
+ - lib/wp2txt/category_cache.rb
223
+ - lib/wp2txt/cli.rb
224
+ - lib/wp2txt/cli_ui.rb
225
+ - lib/wp2txt/config.rb
226
+ - lib/wp2txt/constants.rb
227
+ - lib/wp2txt/data/html_entities.json
228
+ - lib/wp2txt/data/language_metadata.json
229
+ - lib/wp2txt/data/language_tiers.json
230
+ - lib/wp2txt/data/mediawiki_aliases.json
231
+ - lib/wp2txt/data/template_aliases.json
232
+ - lib/wp2txt/data/wikipedia_entities.json
233
+ - lib/wp2txt/extractor.rb
234
+ - lib/wp2txt/file_utils.rb
235
+ - lib/wp2txt/formatter.rb
236
+ - lib/wp2txt/global_data_cache.rb
237
+ - lib/wp2txt/index_cache.rb
238
+ - lib/wp2txt/magic_words.rb
239
+ - lib/wp2txt/memory_monitor.rb
240
+ - lib/wp2txt/multistream.rb
241
+ - lib/wp2txt/output_writer.rb
242
+ - lib/wp2txt/parser_functions.rb
243
+ - lib/wp2txt/ractor_worker.rb
186
244
  - lib/wp2txt/regex.rb
245
+ - lib/wp2txt/section_extractor.rb
246
+ - lib/wp2txt/stream_processor.rb
247
+ - lib/wp2txt/template_expander.rb
248
+ - lib/wp2txt/text_processing.rb
187
249
  - lib/wp2txt/utils.rb
188
250
  - lib/wp2txt/version.rb
251
+ - scripts/benchmark_regex.rb
252
+ - scripts/fetch_html_entities.rb
253
+ - scripts/fetch_language_metadata.rb
254
+ - scripts/fetch_mediawiki_data.rb
255
+ - scripts/fetch_template_data.rb
256
+ - scripts/profile_memory.rb
257
+ - spec/article_spec.rb
258
+ - spec/auto_download_spec.rb
259
+ - spec/bz2_validator_spec.rb
260
+ - spec/category_cache_spec.rb
261
+ - spec/category_fetcher_spec.rb
262
+ - spec/cleanup_spec.rb
263
+ - spec/cli_options_spec.rb
264
+ - spec/cli_spec.rb
265
+ - spec/config_spec.rb
266
+ - spec/constants_spec.rb
267
+ - spec/file_utils_spec.rb
268
+ - spec/fixtures/samples.rb
269
+ - spec/formatter_sections_spec.rb
270
+ - spec/global_data_cache_spec.rb
271
+ - spec/index_cache_spec.rb
272
+ - spec/integration_spec.rb
273
+ - spec/magic_words_spec.rb
274
+ - spec/markers_spec.rb
275
+ - spec/memory_monitor_spec.rb
276
+ - spec/multistream_spec.rb
277
+ - spec/output_writer_spec.rb
278
+ - spec/parser_functions_spec.rb
279
+ - spec/ractor_worker_spec.rb
280
+ - spec/regex_spec.rb
281
+ - spec/section_extractor_spec.rb
189
282
  - spec/spec_helper.rb
283
+ - spec/stream_processor_spec.rb
284
+ - spec/template_data_spec.rb
285
+ - spec/template_expander_spec.rb
286
+ - spec/template_processing_spec.rb
287
+ - spec/text_processing_spec.rb
190
288
  - spec/utils_spec.rb
289
+ - spec/wp2txt_spec.rb
191
290
  - wp2txt.gemspec
192
291
  homepage: https://github.com/yohasebe/wp2txt
193
292
  licenses:
194
293
  - MIT
195
294
  metadata: {}
196
- post_install_message:
197
295
  rdoc_options: []
198
296
  require_paths:
199
297
  - lib
@@ -201,18 +299,48 @@ required_ruby_version: !ruby/object:Gem::Requirement
201
299
  requirements:
202
300
  - - ">="
203
301
  - !ruby/object:Gem::Version
204
- version: '2.6'
302
+ version: '3.0'
205
303
  required_rubygems_version: !ruby/object:Gem::Requirement
206
304
  requirements:
207
305
  - - ">="
208
306
  - !ruby/object:Gem::Version
209
307
  version: '0'
210
308
  requirements: []
211
- rubygems_version: 3.4.12
212
- signing_key:
309
+ rubygems_version: 4.0.3
213
310
  specification_version: 4
214
311
  summary: A command-line toolkit to extract text content and category data from Wikipedia
215
312
  dump files
216
313
  test_files:
314
+ - spec/article_spec.rb
315
+ - spec/auto_download_spec.rb
316
+ - spec/bz2_validator_spec.rb
317
+ - spec/category_cache_spec.rb
318
+ - spec/category_fetcher_spec.rb
319
+ - spec/cleanup_spec.rb
320
+ - spec/cli_options_spec.rb
321
+ - spec/cli_spec.rb
322
+ - spec/config_spec.rb
323
+ - spec/constants_spec.rb
324
+ - spec/file_utils_spec.rb
325
+ - spec/fixtures/samples.rb
326
+ - spec/formatter_sections_spec.rb
327
+ - spec/global_data_cache_spec.rb
328
+ - spec/index_cache_spec.rb
329
+ - spec/integration_spec.rb
330
+ - spec/magic_words_spec.rb
331
+ - spec/markers_spec.rb
332
+ - spec/memory_monitor_spec.rb
333
+ - spec/multistream_spec.rb
334
+ - spec/output_writer_spec.rb
335
+ - spec/parser_functions_spec.rb
336
+ - spec/ractor_worker_spec.rb
337
+ - spec/regex_spec.rb
338
+ - spec/section_extractor_spec.rb
217
339
  - spec/spec_helper.rb
340
+ - spec/stream_processor_spec.rb
341
+ - spec/template_data_spec.rb
342
+ - spec/template_expander_spec.rb
343
+ - spec/template_processing_spec.rb
344
+ - spec/text_processing_spec.rb
218
345
  - spec/utils_spec.rb
346
+ - spec/wp2txt_spec.rb
data/.rubocop.yml DELETED
@@ -1,80 +0,0 @@
1
- AllCops:
2
- NewCops: disable
3
- SuggestExtensions: false
4
- TargetRubyVersion: 2.6
5
-
6
- Documentation:
7
- Enabled: false
8
-
9
- Naming/AccessorMethodName:
10
- Enabled: false
11
-
12
- Naming/VariableNumber:
13
- Enabled: false
14
-
15
- Naming/FileName:
16
- Enabled: false
17
-
18
- Security/MarshalLoad:
19
- Enabled: false
20
-
21
- Security/Open:
22
- Enabled: false
23
-
24
- Layout/EndOfLine:
25
- Enabled: False
26
-
27
- Style/FormatStringToken:
28
- Enabled: false
29
-
30
- Style/ClassVars:
31
- Enabled: false
32
-
33
- Style/OptionalBooleanParameter:
34
- Enabled: false
35
-
36
- Style/StringConcatenation:
37
- Enabled: false
38
-
39
- Style/PerlBackrefs:
40
- Enabled: false
41
-
42
- Style/StringLiterals:
43
- Enabled: false
44
-
45
- Style/StringLiteralsInInterpolation:
46
- Enabled: true
47
- EnforcedStyle: double_quotes
48
-
49
- Style/WordArray:
50
- Enabled: false
51
-
52
- Style/EvalWithLocation:
53
- Enabled: false
54
-
55
- Layout/LineLength:
56
- Max: 400
57
-
58
- Metrics/MethodLength:
59
- Max: 200
60
-
61
- Metrics/BlockLength:
62
- Max: 200
63
-
64
- Metrics/AbcSize:
65
- Max: 200
66
-
67
- Metrics/PerceivedComplexity:
68
- Max: 60
69
-
70
- Metrics/ClassLength:
71
- Max: 800
72
-
73
- Metrics/CyclomaticComplexity:
74
- Max: 60
75
-
76
- Metrics/ParameterLists:
77
- Max: 8
78
-
79
- Metrics/ModuleLength:
80
- Max: 600