wp2txt 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +17 -11
- data/lib/wp2txt/utils.rb +22 -21
- data/lib/wp2txt/version.rb +1 -1
- data/tags +58 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bb540f4f17f7825786d110245c235ac556e3e64cedb17efae3e0591887425801
|
4
|
+
data.tar.gz: 479c357f7ba117ae10d9a5a04d24ce3aca2e54d942a156b02eb932c1aab55c8b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 940d47d2c8bce06029fe76e3b3744563d089e26e297e5224b36e65d815295da57117eae84cbb43abeddf2f2c052e2a987d668cba52c7af6148e935b571b6d403
|
7
|
+
data.tar.gz: 8ce76523a3bf181ac7a5da11f088dd14cfb1e1d7ac0d5239832db52968d183db16a3ece6074513b634eebe0e5ca28ceea945eaef6542ecb1933266caf4e89a3c
|
data/README.md
CHANGED
@@ -6,20 +6,26 @@ A command-line toolkit to extract text content and category data from Wikipedia
|
|
6
6
|
|
7
7
|
WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.
|
8
8
|
|
9
|
-
|
9
|
+
## Changelog
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
**November 2022**
|
12
|
+
|
13
|
+
- Code added to suppress "Invalid byte sequence error" when an illegal UTF-8 character is input.
|
14
|
+
|
15
|
+
**August 2022**
|
16
|
+
|
17
|
+
- A new option `--category-only` has been added. When this option is enabled, only the title and category information of the article is extracted.
|
18
|
+
- A new option `--summary-only` has been added. If this option is enabled, only the title, category information, and opening paragraphs of the article will be extracted.
|
19
|
+
- Text conversion with the current version of WP2TXT is *more than 2x faster* than the previous version due to parallel processing of multiple files (the rate of speedup depends on the CPU cores used for processing).
|
14
20
|
|
15
21
|
## Screenshot
|
16
22
|
|
17
|
-
<img src='https://raw.githubusercontent.com/yohasebe/wp2txt/master/image/screenshot.png' width="
|
23
|
+
<img src='https://raw.githubusercontent.com/yohasebe/wp2txt/master/image/screenshot.png' width="800" />
|
18
24
|
|
19
|
-
**Environment**
|
25
|
+
**Environment**
|
20
26
|
|
21
27
|
- WP2TXT 1.0.1
|
22
|
-
- MacBook Pro (2021 Apple M1 Pro)
|
28
|
+
- MacBook Pro (2021 Apple M1 Pro)
|
23
29
|
- enwiki-20220720-pages-articles.xml.bz2 (19.98 GB)
|
24
30
|
|
25
31
|
In the above environment, the process (decompression, splitting, extraction, and conversion) to obtain the plain text data of the English Wikipedia takes less than 1.5 hours.
|
@@ -34,7 +40,7 @@ In the above environment, the process (decompression, splitting, extraction, and
|
|
34
40
|
|
35
41
|
## Preparation
|
36
42
|
|
37
|
-
### For MacOS
|
43
|
+
### For MacOS and Linux
|
38
44
|
|
39
45
|
WP2TXT requires that one of the following commands be installed on the system in order to decompress `bz2` files:
|
40
46
|
|
@@ -184,11 +190,11 @@ The author will appreciate your mentioning one of these in your research.
|
|
184
190
|
Or use this BibTeX entry:
|
185
191
|
|
186
192
|
```
|
187
|
-
@misc{
|
193
|
+
@misc{wp2txt_2022,
|
188
194
|
author = {Yoichiro Hasebe},
|
189
195
|
title = {WP2TXT: A command-line toolkit to extract text content and category data from Wikipedia dump files},
|
190
|
-
url = {https://github.com/yohasebe/wp2txt}
|
191
|
-
year = {2022}
|
196
|
+
url = {https://github.com/yohasebe/wp2txt},
|
197
|
+
year = {2022}
|
192
198
|
}
|
193
199
|
```
|
194
200
|
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -41,7 +41,7 @@ $in_table_regex2 = Regexp.new('^\|\}.*?$')
|
|
41
41
|
$in_unordered_regex = Regexp.new('^\*')
|
42
42
|
$in_ordered_regex = Regexp.new('^\#')
|
43
43
|
$in_pre_regex = Regexp.new('^ ')
|
44
|
-
$in_definition_regex = Regexp.new('^[\;\:]')
|
44
|
+
$in_definition_regex = Regexp.new('^[\;\:]')
|
45
45
|
$blank_line_regex = Regexp.new('^\s*$')
|
46
46
|
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
47
47
|
$remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
|
@@ -98,11 +98,12 @@ $cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
|
|
98
98
|
module Wp2txt
|
99
99
|
|
100
100
|
def convert_characters!(text, has_retried = false)
|
101
|
-
begin
|
102
|
-
text << ""
|
101
|
+
begin
|
102
|
+
text << ""
|
103
103
|
chrref_to_utf!(text)
|
104
104
|
special_chr!(text)
|
105
|
-
|
105
|
+
text.encode!("UTF-8", "UTF-8", invalid: :replace, replace: "")
|
106
|
+
|
106
107
|
rescue # detect invalid byte sequence in UTF-8
|
107
108
|
if has_retried
|
108
109
|
puts "invalid byte sequence detected"
|
@@ -112,20 +113,20 @@ module Wp2txt
|
|
112
113
|
end
|
113
114
|
exit
|
114
115
|
else
|
115
|
-
text.encode!("UTF-16")
|
116
|
-
text.encode!("UTF-
|
116
|
+
text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
|
117
|
+
text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
|
117
118
|
convert_characters!(text, true)
|
118
119
|
end
|
119
120
|
end
|
120
121
|
end
|
121
|
-
|
122
|
+
|
122
123
|
def format_wiki!(text, has_retried = false)
|
123
124
|
remove_complex!(text)
|
124
125
|
|
125
126
|
escape_nowiki!(text)
|
126
127
|
process_interwiki_links!(text)
|
127
128
|
process_external_links!(text)
|
128
|
-
unescape_nowiki!(text)
|
129
|
+
unescape_nowiki!(text)
|
129
130
|
remove_directive!(text)
|
130
131
|
remove_emphasis!(text)
|
131
132
|
mndash!(text)
|
@@ -135,7 +136,7 @@ module Wp2txt
|
|
135
136
|
remove_templates!(text) unless $leave_inline_template
|
136
137
|
remove_table!(text) unless $leave_table
|
137
138
|
end
|
138
|
-
|
139
|
+
|
139
140
|
def cleanup!(text)
|
140
141
|
text.gsub!($cleanup_regex_01){""}
|
141
142
|
text.gsub!($cleanup_regex_02){""}
|
@@ -150,7 +151,7 @@ module Wp2txt
|
|
150
151
|
end
|
151
152
|
|
152
153
|
#################### parser for nested structure ####################
|
153
|
-
|
154
|
+
|
154
155
|
def process_nested_structure(scanner, left, right, &block)
|
155
156
|
test = false
|
156
157
|
buffer = ""
|
@@ -195,7 +196,7 @@ module Wp2txt
|
|
195
196
|
rescue => e
|
196
197
|
return scanner.string
|
197
198
|
end
|
198
|
-
end
|
199
|
+
end
|
199
200
|
|
200
201
|
#################### methods used from format_wiki ####################
|
201
202
|
def escape_nowiki!(str)
|
@@ -218,11 +219,11 @@ module Wp2txt
|
|
218
219
|
@nowikis[obj_id]
|
219
220
|
end
|
220
221
|
end
|
221
|
-
|
222
|
+
|
222
223
|
def process_interwiki_links!(str)
|
223
224
|
scanner = StringScanner.new(str)
|
224
225
|
result = process_nested_structure(scanner, "[[", "]]") do |contents|
|
225
|
-
parts = contents.split("|")
|
226
|
+
parts = contents.split("|")
|
226
227
|
case parts.size
|
227
228
|
when 1
|
228
229
|
parts.first || ""
|
@@ -265,7 +266,7 @@ module Wp2txt
|
|
265
266
|
end
|
266
267
|
str.replace(result)
|
267
268
|
end
|
268
|
-
|
269
|
+
|
269
270
|
def remove_table!(str)
|
270
271
|
scanner = StringScanner.new(str)
|
271
272
|
result = process_nested_structure(scanner, "{|", "|}") do |contents|
|
@@ -273,7 +274,7 @@ module Wp2txt
|
|
273
274
|
end
|
274
275
|
str.replace(result)
|
275
276
|
end
|
276
|
-
|
277
|
+
|
277
278
|
def special_chr!(str)
|
278
279
|
str.replace $html_decoder.decode(str)
|
279
280
|
end
|
@@ -316,7 +317,7 @@ module Wp2txt
|
|
316
317
|
end
|
317
318
|
return true
|
318
319
|
end
|
319
|
-
|
320
|
+
|
320
321
|
def mndash!(str)
|
321
322
|
str.gsub!($mndash_regex, "–")
|
322
323
|
end
|
@@ -347,7 +348,7 @@ module Wp2txt
|
|
347
348
|
str.gsub!($complex_regex_04){""}
|
348
349
|
str.gsub!($complex_regex_05){""}
|
349
350
|
end
|
350
|
-
|
351
|
+
|
351
352
|
def make_reference!(str)
|
352
353
|
str.gsub!($make_reference_regex_a){"\n"}
|
353
354
|
str.gsub!($make_reference_regex_b){""}
|
@@ -413,7 +414,7 @@ module Wp2txt
|
|
413
414
|
File.rename(file_path, file_path + ".bak")
|
414
415
|
File.rename("temp", file_path)
|
415
416
|
File.unlink(file_path + ".bak") unless backup
|
416
|
-
end
|
417
|
+
end
|
417
418
|
|
418
419
|
# modify files under a directory (recursive)
|
419
420
|
def batch_file_mod(dir_path, &block)
|
@@ -421,7 +422,7 @@ module Wp2txt
|
|
421
422
|
collect_files(dir_path).each do |file|
|
422
423
|
yield file if FileTest.file?(file)
|
423
424
|
end
|
424
|
-
else
|
425
|
+
else
|
425
426
|
yield dir_path if FileTest.file?(dir_path)
|
426
427
|
end
|
427
428
|
end
|
@@ -445,9 +446,9 @@ module Wp2txt
|
|
445
446
|
end
|
446
447
|
end
|
447
448
|
|
448
|
-
def rename(files, ext = "txt")
|
449
|
+
def rename(files, ext = "txt")
|
449
450
|
# num of digits necessary to name the last file generated
|
450
|
-
maxwidth = 0
|
451
|
+
maxwidth = 0
|
451
452
|
|
452
453
|
files.each do |f|
|
453
454
|
width = f.slice(/\-(\d+)\z/, 1).to_s.length.to_i
|
data/lib/wp2txt/version.rb
CHANGED
data/tags
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
!_TAG_FILE_FORMAT 2 /extended format; --format=1 will not append ;" to lines/
|
2
|
+
!_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted, 2=foldcase/
|
3
|
+
!_TAG_PROGRAM_AUTHOR Darren Hiebert /dhiebert@users.sourceforge.net/
|
4
|
+
!_TAG_PROGRAM_NAME Exuberant Ctags //
|
5
|
+
!_TAG_PROGRAM_URL http://ctags.sourceforge.net /official site/
|
6
|
+
!_TAG_PROGRAM_VERSION 5.8 //
|
7
|
+
Article lib/wp2txt/article.rb /^ class Article$/;" c class:Wp2txt
|
8
|
+
Runner lib/wp2txt.rb /^ class Runner$/;" c class:Wp2txt.Splitter.file_size
|
9
|
+
Splitter lib/wp2txt.rb /^ class Splitter$/;" c class:Wp2txt
|
10
|
+
Wp2txt lib/wp2txt.rb /^module Wp2txt$/;" m
|
11
|
+
Wp2txt lib/wp2txt/article.rb /^module Wp2txt$/;" m
|
12
|
+
Wp2txt lib/wp2txt/utils.rb /^module Wp2txt$/;" m
|
13
|
+
Wp2txt lib/wp2txt/version.rb /^module Wp2txt$/;" m
|
14
|
+
batch_file_mod lib/wp2txt/utils.rb /^ def batch_file_mod(dir_path, &block)$/;" f
|
15
|
+
chrref_to_utf! lib/wp2txt/utils.rb /^ def chrref_to_utf!(num_str)$/;" f
|
16
|
+
cleanup! lib/wp2txt/utils.rb /^ def cleanup!(text)$/;" f
|
17
|
+
collect_files lib/wp2txt/utils.rb /^ def collect_files(str, regex = nil)$/;" f
|
18
|
+
command_exist? lib/wp2txt.rb /^ def command_exist?(command)$/;" f class:Wp2txt.Splitter.file_size
|
19
|
+
convert_characters! lib/wp2txt/utils.rb /^ def convert_characters!(text, has_retried = false)$/;" f class:Wp2txt
|
20
|
+
correct_inline_template! lib/wp2txt/utils.rb /^ def correct_inline_template!(str)$/;" f
|
21
|
+
correct_separator lib/wp2txt/utils.rb /^ def correct_separator(input)$/;" f
|
22
|
+
create_element lib/wp2txt/article.rb /^ def create_element(tp, text)$/;" f class:Wp2txt.Article
|
23
|
+
escape_nowiki! lib/wp2txt/utils.rb /^ def escape_nowiki!(str)$/;" f
|
24
|
+
extract_text lib/wp2txt.rb /^ def extract_text(&block)$/;" f class:Wp2txt.Splitter.file_size.Runner.fill_buffer
|
25
|
+
file_mod lib/wp2txt/utils.rb /^ def file_mod(file_path, backup = false, &block)$/;" f
|
26
|
+
file_size lib/wp2txt.rb /^ def file_size(file)$/;" f class:Wp2txt.Splitter
|
27
|
+
fill_buffer lib/wp2txt.rb /^ def fill_buffer$/;" f class:Wp2txt.Splitter.file_size
|
28
|
+
fill_buffer lib/wp2txt.rb /^ def fill_buffer$/;" f class:Wp2txt.Splitter.file_size.Runner
|
29
|
+
format_wiki! lib/wp2txt/utils.rb /^ def format_wiki!(text, has_retried = false)$/;" f
|
30
|
+
get_newline lib/wp2txt.rb /^ def get_newline$/;" f class:Wp2txt.Splitter.file_size.Runner.fill_buffer
|
31
|
+
get_newline lib/wp2txt.rb /^ def get_newline$/;" f class:Wp2txt.Splitter.file_size.fill_buffer
|
32
|
+
get_page lib/wp2txt.rb /^ def get_page$/;" f class:Wp2txt.Splitter.file_size.Runner.fill_buffer
|
33
|
+
initialize lib/wp2txt.rb /^ def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)$/;" f class:Wp2txt.Splitter.file_size.Runner
|
34
|
+
initialize lib/wp2txt.rb /^ def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)$/;" f class:Wp2txt.Splitter
|
35
|
+
initialize lib/wp2txt/article.rb /^ def initialize(text, title = "", strip_tmarker = false)$/;" f class:Wp2txt.Article
|
36
|
+
make_reference! lib/wp2txt/utils.rb /^ def make_reference!(str)$/;" f
|
37
|
+
mndash! lib/wp2txt/utils.rb /^ def mndash!(str)$/;" f
|
38
|
+
parse lib/wp2txt/article.rb /^ def parse(source)$/;" f class:Wp2txt.Article
|
39
|
+
prepare lib/wp2txt.rb /^ def prepare$/;" f class:Wp2txt.Splitter.file_size
|
40
|
+
prepare lib/wp2txt.rb /^ def prepare$/;" f class:Wp2txt.Splitter.file_size.Runner
|
41
|
+
process_external_links! lib/wp2txt/utils.rb /^ def process_external_links!(str)$/;" f
|
42
|
+
process_interwiki_links! lib/wp2txt/utils.rb /^ def process_interwiki_links!(str)$/;" f
|
43
|
+
process_nested_structure lib/wp2txt/utils.rb /^ def process_nested_structure(scanner, left, right, &block)$/;" f
|
44
|
+
remove_complex! lib/wp2txt/utils.rb /^ def remove_complex!(str)$/;" f
|
45
|
+
remove_directive! lib/wp2txt/utils.rb /^ def remove_directive!(str)$/;" f
|
46
|
+
remove_emphasis! lib/wp2txt/utils.rb /^ def remove_emphasis!(str)$/;" f
|
47
|
+
remove_hr! lib/wp2txt/utils.rb /^ def remove_hr!(str)$/;" f
|
48
|
+
remove_html! lib/wp2txt/utils.rb /^ def remove_html!(str)$/;" f
|
49
|
+
remove_inbetween! lib/wp2txt/utils.rb /^ def remove_inbetween!(str, tagset = ['<', '>'])$/;" f
|
50
|
+
remove_ref! lib/wp2txt/utils.rb /^ def remove_ref!(str)$/;" f
|
51
|
+
remove_table! lib/wp2txt/utils.rb /^ def remove_table!(str)$/;" f
|
52
|
+
remove_tag! lib/wp2txt/utils.rb /^ def remove_tag!(str)$/;" f
|
53
|
+
remove_templates! lib/wp2txt/utils.rb /^ def remove_templates!(str)$/;" f
|
54
|
+
rename lib/wp2txt/utils.rb /^ def rename(files, ext = "txt")$/;" f
|
55
|
+
sec_to_str lib/wp2txt/utils.rb /^ def sec_to_str(int)$/;" f
|
56
|
+
special_chr! lib/wp2txt/utils.rb /^ def special_chr!(str)$/;" f
|
57
|
+
split_file lib/wp2txt.rb /^ def split_file$/;" f class:Wp2txt.Splitter.file_size.fill_buffer
|
58
|
+
unescape_nowiki! lib/wp2txt/utils.rb /^ def unescape_nowiki!(str)$/;" f
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-11-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -140,6 +140,7 @@ files:
|
|
140
140
|
- lib/wp2txt/version.rb
|
141
141
|
- spec/spec_helper.rb
|
142
142
|
- spec/utils_spec.rb
|
143
|
+
- tags
|
143
144
|
- wp2txt.gemspec
|
144
145
|
homepage: https://github.com/yohasebe/wp2txt
|
145
146
|
licenses: []
|
@@ -159,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
159
160
|
- !ruby/object:Gem::Version
|
160
161
|
version: '0'
|
161
162
|
requirements: []
|
162
|
-
rubygems_version: 3.3.
|
163
|
+
rubygems_version: 3.3.3
|
163
164
|
signing_key:
|
164
165
|
specification_version: 4
|
165
166
|
summary: A command-line toolkit to extract text content and category data from Wikipedia
|