wp2txt 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +17 -11
- data/lib/wp2txt/utils.rb +22 -21
- data/lib/wp2txt/version.rb +1 -1
- data/tags +58 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bb540f4f17f7825786d110245c235ac556e3e64cedb17efae3e0591887425801
|
4
|
+
data.tar.gz: 479c357f7ba117ae10d9a5a04d24ce3aca2e54d942a156b02eb932c1aab55c8b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 940d47d2c8bce06029fe76e3b3744563d089e26e297e5224b36e65d815295da57117eae84cbb43abeddf2f2c052e2a987d668cba52c7af6148e935b571b6d403
|
7
|
+
data.tar.gz: 8ce76523a3bf181ac7a5da11f088dd14cfb1e1d7ac0d5239832db52968d183db16a3ece6074513b634eebe0e5ca28ceea945eaef6542ecb1933266caf4e89a3c
|
data/README.md
CHANGED
@@ -6,20 +6,26 @@ A command-line toolkit to extract text content and category data from Wikipedia
|
|
6
6
|
|
7
7
|
WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.
|
8
8
|
|
9
|
-
|
9
|
+
## Changelog
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
**November 2022**
|
12
|
+
|
13
|
+
- Code added to suppress "Invalid byte sequence error" when an illegal UTF-8 character is input.
|
14
|
+
|
15
|
+
**August 2022**
|
16
|
+
|
17
|
+
- A new option `--category-only` has been added. When this option is enabled, only the title and category information of the article is extracted.
|
18
|
+
- A new option `--summary-only` has been added. If this option is enabled, only the title, category information, and opening paragraphs of the article will be extracted.
|
19
|
+
- Text conversion with the current version of WP2TXT is *more than 2x faster* than the previous version due to parallel processing of multiple files (the rate of speedup depends on the CPU cores used for processing).
|
14
20
|
|
15
21
|
## Screenshot
|
16
22
|
|
17
|
-
<img src='https://raw.githubusercontent.com/yohasebe/wp2txt/master/image/screenshot.png' width="
|
23
|
+
<img src='https://raw.githubusercontent.com/yohasebe/wp2txt/master/image/screenshot.png' width="800" />
|
18
24
|
|
19
|
-
**Environment**
|
25
|
+
**Environment**
|
20
26
|
|
21
27
|
- WP2TXT 1.0.1
|
22
|
-
- MacBook Pro (2021 Apple M1 Pro)
|
28
|
+
- MacBook Pro (2021 Apple M1 Pro)
|
23
29
|
- enwiki-20220720-pages-articles.xml.bz2 (19.98 GB)
|
24
30
|
|
25
31
|
In the above environment, the process (decompression, splitting, extraction, and conversion) to obtain the plain text data of the English Wikipedia takes less than 1.5 hours.
|
@@ -34,7 +40,7 @@ In the above environment, the process (decompression, splitting, extraction, and
|
|
34
40
|
|
35
41
|
## Preparation
|
36
42
|
|
37
|
-
### For MacOS
|
43
|
+
### For MacOS and Linux
|
38
44
|
|
39
45
|
WP2TXT requires that one of the following commands be installed on the system in order to decompress `bz2` files:
|
40
46
|
|
@@ -184,11 +190,11 @@ The author will appreciate your mentioning one of these in your research.
|
|
184
190
|
Or use this BibTeX entry:
|
185
191
|
|
186
192
|
```
|
187
|
-
@misc{
|
193
|
+
@misc{wp2txt_2022,
|
188
194
|
author = {Yoichiro Hasebe},
|
189
195
|
title = {WP2TXT: A command-line toolkit to extract text content and category data from Wikipedia dump files},
|
190
|
-
url = {https://github.com/yohasebe/wp2txt}
|
191
|
-
year = {2022}
|
196
|
+
url = {https://github.com/yohasebe/wp2txt},
|
197
|
+
year = {2022}
|
192
198
|
}
|
193
199
|
```
|
194
200
|
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -41,7 +41,7 @@ $in_table_regex2 = Regexp.new('^\|\}.*?$')
|
|
41
41
|
$in_unordered_regex = Regexp.new('^\*')
|
42
42
|
$in_ordered_regex = Regexp.new('^\#')
|
43
43
|
$in_pre_regex = Regexp.new('^ ')
|
44
|
-
$in_definition_regex = Regexp.new('^[\;\:]')
|
44
|
+
$in_definition_regex = Regexp.new('^[\;\:]')
|
45
45
|
$blank_line_regex = Regexp.new('^\s*$')
|
46
46
|
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
47
47
|
$remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
|
@@ -98,11 +98,12 @@ $cleanup_regex_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
|
|
98
98
|
module Wp2txt
|
99
99
|
|
100
100
|
def convert_characters!(text, has_retried = false)
|
101
|
-
begin
|
102
|
-
text << ""
|
101
|
+
begin
|
102
|
+
text << ""
|
103
103
|
chrref_to_utf!(text)
|
104
104
|
special_chr!(text)
|
105
|
-
|
105
|
+
text.encode!("UTF-8", "UTF-8", invalid: :replace, replace: "")
|
106
|
+
|
106
107
|
rescue # detect invalid byte sequence in UTF-8
|
107
108
|
if has_retried
|
108
109
|
puts "invalid byte sequence detected"
|
@@ -112,20 +113,20 @@ module Wp2txt
|
|
112
113
|
end
|
113
114
|
exit
|
114
115
|
else
|
115
|
-
text.encode!("UTF-16")
|
116
|
-
text.encode!("UTF-
|
116
|
+
text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
|
117
|
+
text.encode!("UTF-16", "UTF-16", invalid: :replace, replace: "")
|
117
118
|
convert_characters!(text, true)
|
118
119
|
end
|
119
120
|
end
|
120
121
|
end
|
121
|
-
|
122
|
+
|
122
123
|
def format_wiki!(text, has_retried = false)
|
123
124
|
remove_complex!(text)
|
124
125
|
|
125
126
|
escape_nowiki!(text)
|
126
127
|
process_interwiki_links!(text)
|
127
128
|
process_external_links!(text)
|
128
|
-
unescape_nowiki!(text)
|
129
|
+
unescape_nowiki!(text)
|
129
130
|
remove_directive!(text)
|
130
131
|
remove_emphasis!(text)
|
131
132
|
mndash!(text)
|
@@ -135,7 +136,7 @@ module Wp2txt
|
|
135
136
|
remove_templates!(text) unless $leave_inline_template
|
136
137
|
remove_table!(text) unless $leave_table
|
137
138
|
end
|
138
|
-
|
139
|
+
|
139
140
|
def cleanup!(text)
|
140
141
|
text.gsub!($cleanup_regex_01){""}
|
141
142
|
text.gsub!($cleanup_regex_02){""}
|
@@ -150,7 +151,7 @@ module Wp2txt
|
|
150
151
|
end
|
151
152
|
|
152
153
|
#################### parser for nested structure ####################
|
153
|
-
|
154
|
+
|
154
155
|
def process_nested_structure(scanner, left, right, &block)
|
155
156
|
test = false
|
156
157
|
buffer = ""
|
@@ -195,7 +196,7 @@ module Wp2txt
|
|
195
196
|
rescue => e
|
196
197
|
return scanner.string
|
197
198
|
end
|
198
|
-
end
|
199
|
+
end
|
199
200
|
|
200
201
|
#################### methods used from format_wiki ####################
|
201
202
|
def escape_nowiki!(str)
|
@@ -218,11 +219,11 @@ module Wp2txt
|
|
218
219
|
@nowikis[obj_id]
|
219
220
|
end
|
220
221
|
end
|
221
|
-
|
222
|
+
|
222
223
|
def process_interwiki_links!(str)
|
223
224
|
scanner = StringScanner.new(str)
|
224
225
|
result = process_nested_structure(scanner, "[[", "]]") do |contents|
|
225
|
-
parts = contents.split("|")
|
226
|
+
parts = contents.split("|")
|
226
227
|
case parts.size
|
227
228
|
when 1
|
228
229
|
parts.first || ""
|
@@ -265,7 +266,7 @@ module Wp2txt
|
|
265
266
|
end
|
266
267
|
str.replace(result)
|
267
268
|
end
|
268
|
-
|
269
|
+
|
269
270
|
def remove_table!(str)
|
270
271
|
scanner = StringScanner.new(str)
|
271
272
|
result = process_nested_structure(scanner, "{|", "|}") do |contents|
|
@@ -273,7 +274,7 @@ module Wp2txt
|
|
273
274
|
end
|
274
275
|
str.replace(result)
|
275
276
|
end
|
276
|
-
|
277
|
+
|
277
278
|
def special_chr!(str)
|
278
279
|
str.replace $html_decoder.decode(str)
|
279
280
|
end
|
@@ -316,7 +317,7 @@ module Wp2txt
|
|
316
317
|
end
|
317
318
|
return true
|
318
319
|
end
|
319
|
-
|
320
|
+
|
320
321
|
def mndash!(str)
|
321
322
|
str.gsub!($mndash_regex, "–")
|
322
323
|
end
|
@@ -347,7 +348,7 @@ module Wp2txt
|
|
347
348
|
str.gsub!($complex_regex_04){""}
|
348
349
|
str.gsub!($complex_regex_05){""}
|
349
350
|
end
|
350
|
-
|
351
|
+
|
351
352
|
def make_reference!(str)
|
352
353
|
str.gsub!($make_reference_regex_a){"\n"}
|
353
354
|
str.gsub!($make_reference_regex_b){""}
|
@@ -413,7 +414,7 @@ module Wp2txt
|
|
413
414
|
File.rename(file_path, file_path + ".bak")
|
414
415
|
File.rename("temp", file_path)
|
415
416
|
File.unlink(file_path + ".bak") unless backup
|
416
|
-
end
|
417
|
+
end
|
417
418
|
|
418
419
|
# modify files under a directory (recursive)
|
419
420
|
def batch_file_mod(dir_path, &block)
|
@@ -421,7 +422,7 @@ module Wp2txt
|
|
421
422
|
collect_files(dir_path).each do |file|
|
422
423
|
yield file if FileTest.file?(file)
|
423
424
|
end
|
424
|
-
else
|
425
|
+
else
|
425
426
|
yield dir_path if FileTest.file?(dir_path)
|
426
427
|
end
|
427
428
|
end
|
@@ -445,9 +446,9 @@ module Wp2txt
|
|
445
446
|
end
|
446
447
|
end
|
447
448
|
|
448
|
-
def rename(files, ext = "txt")
|
449
|
+
def rename(files, ext = "txt")
|
449
450
|
# num of digits necessary to name the last file generated
|
450
|
-
maxwidth = 0
|
451
|
+
maxwidth = 0
|
451
452
|
|
452
453
|
files.each do |f|
|
453
454
|
width = f.slice(/\-(\d+)\z/, 1).to_s.length.to_i
|
data/lib/wp2txt/version.rb
CHANGED
data/tags
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
!_TAG_FILE_FORMAT 2 /extended format; --format=1 will not append ;" to lines/
|
2
|
+
!_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted, 2=foldcase/
|
3
|
+
!_TAG_PROGRAM_AUTHOR Darren Hiebert /dhiebert@users.sourceforge.net/
|
4
|
+
!_TAG_PROGRAM_NAME Exuberant Ctags //
|
5
|
+
!_TAG_PROGRAM_URL http://ctags.sourceforge.net /official site/
|
6
|
+
!_TAG_PROGRAM_VERSION 5.8 //
|
7
|
+
Article lib/wp2txt/article.rb /^ class Article$/;" c class:Wp2txt
|
8
|
+
Runner lib/wp2txt.rb /^ class Runner$/;" c class:Wp2txt.Splitter.file_size
|
9
|
+
Splitter lib/wp2txt.rb /^ class Splitter$/;" c class:Wp2txt
|
10
|
+
Wp2txt lib/wp2txt.rb /^module Wp2txt$/;" m
|
11
|
+
Wp2txt lib/wp2txt/article.rb /^module Wp2txt$/;" m
|
12
|
+
Wp2txt lib/wp2txt/utils.rb /^module Wp2txt$/;" m
|
13
|
+
Wp2txt lib/wp2txt/version.rb /^module Wp2txt$/;" m
|
14
|
+
batch_file_mod lib/wp2txt/utils.rb /^ def batch_file_mod(dir_path, &block)$/;" f
|
15
|
+
chrref_to_utf! lib/wp2txt/utils.rb /^ def chrref_to_utf!(num_str)$/;" f
|
16
|
+
cleanup! lib/wp2txt/utils.rb /^ def cleanup!(text)$/;" f
|
17
|
+
collect_files lib/wp2txt/utils.rb /^ def collect_files(str, regex = nil)$/;" f
|
18
|
+
command_exist? lib/wp2txt.rb /^ def command_exist?(command)$/;" f class:Wp2txt.Splitter.file_size
|
19
|
+
convert_characters! lib/wp2txt/utils.rb /^ def convert_characters!(text, has_retried = false)$/;" f class:Wp2txt
|
20
|
+
correct_inline_template! lib/wp2txt/utils.rb /^ def correct_inline_template!(str)$/;" f
|
21
|
+
correct_separator lib/wp2txt/utils.rb /^ def correct_separator(input)$/;" f
|
22
|
+
create_element lib/wp2txt/article.rb /^ def create_element(tp, text)$/;" f class:Wp2txt.Article
|
23
|
+
escape_nowiki! lib/wp2txt/utils.rb /^ def escape_nowiki!(str)$/;" f
|
24
|
+
extract_text lib/wp2txt.rb /^ def extract_text(&block)$/;" f class:Wp2txt.Splitter.file_size.Runner.fill_buffer
|
25
|
+
file_mod lib/wp2txt/utils.rb /^ def file_mod(file_path, backup = false, &block)$/;" f
|
26
|
+
file_size lib/wp2txt.rb /^ def file_size(file)$/;" f class:Wp2txt.Splitter
|
27
|
+
fill_buffer lib/wp2txt.rb /^ def fill_buffer$/;" f class:Wp2txt.Splitter.file_size
|
28
|
+
fill_buffer lib/wp2txt.rb /^ def fill_buffer$/;" f class:Wp2txt.Splitter.file_size.Runner
|
29
|
+
format_wiki! lib/wp2txt/utils.rb /^ def format_wiki!(text, has_retried = false)$/;" f
|
30
|
+
get_newline lib/wp2txt.rb /^ def get_newline$/;" f class:Wp2txt.Splitter.file_size.Runner.fill_buffer
|
31
|
+
get_newline lib/wp2txt.rb /^ def get_newline$/;" f class:Wp2txt.Splitter.file_size.fill_buffer
|
32
|
+
get_page lib/wp2txt.rb /^ def get_page$/;" f class:Wp2txt.Splitter.file_size.Runner.fill_buffer
|
33
|
+
initialize lib/wp2txt.rb /^ def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)$/;" f class:Wp2txt.Splitter.file_size.Runner
|
34
|
+
initialize lib/wp2txt.rb /^ def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)$/;" f class:Wp2txt.Splitter
|
35
|
+
initialize lib/wp2txt/article.rb /^ def initialize(text, title = "", strip_tmarker = false)$/;" f class:Wp2txt.Article
|
36
|
+
make_reference! lib/wp2txt/utils.rb /^ def make_reference!(str)$/;" f
|
37
|
+
mndash! lib/wp2txt/utils.rb /^ def mndash!(str)$/;" f
|
38
|
+
parse lib/wp2txt/article.rb /^ def parse(source)$/;" f class:Wp2txt.Article
|
39
|
+
prepare lib/wp2txt.rb /^ def prepare$/;" f class:Wp2txt.Splitter.file_size
|
40
|
+
prepare lib/wp2txt.rb /^ def prepare$/;" f class:Wp2txt.Splitter.file_size.Runner
|
41
|
+
process_external_links! lib/wp2txt/utils.rb /^ def process_external_links!(str)$/;" f
|
42
|
+
process_interwiki_links! lib/wp2txt/utils.rb /^ def process_interwiki_links!(str)$/;" f
|
43
|
+
process_nested_structure lib/wp2txt/utils.rb /^ def process_nested_structure(scanner, left, right, &block)$/;" f
|
44
|
+
remove_complex! lib/wp2txt/utils.rb /^ def remove_complex!(str)$/;" f
|
45
|
+
remove_directive! lib/wp2txt/utils.rb /^ def remove_directive!(str)$/;" f
|
46
|
+
remove_emphasis! lib/wp2txt/utils.rb /^ def remove_emphasis!(str)$/;" f
|
47
|
+
remove_hr! lib/wp2txt/utils.rb /^ def remove_hr!(str)$/;" f
|
48
|
+
remove_html! lib/wp2txt/utils.rb /^ def remove_html!(str)$/;" f
|
49
|
+
remove_inbetween! lib/wp2txt/utils.rb /^ def remove_inbetween!(str, tagset = ['<', '>'])$/;" f
|
50
|
+
remove_ref! lib/wp2txt/utils.rb /^ def remove_ref!(str)$/;" f
|
51
|
+
remove_table! lib/wp2txt/utils.rb /^ def remove_table!(str)$/;" f
|
52
|
+
remove_tag! lib/wp2txt/utils.rb /^ def remove_tag!(str)$/;" f
|
53
|
+
remove_templates! lib/wp2txt/utils.rb /^ def remove_templates!(str)$/;" f
|
54
|
+
rename lib/wp2txt/utils.rb /^ def rename(files, ext = "txt")$/;" f
|
55
|
+
sec_to_str lib/wp2txt/utils.rb /^ def sec_to_str(int)$/;" f
|
56
|
+
special_chr! lib/wp2txt/utils.rb /^ def special_chr!(str)$/;" f
|
57
|
+
split_file lib/wp2txt.rb /^ def split_file$/;" f class:Wp2txt.Splitter.file_size.fill_buffer
|
58
|
+
unescape_nowiki! lib/wp2txt/utils.rb /^ def unescape_nowiki!(str)$/;" f
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-11-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -140,6 +140,7 @@ files:
|
|
140
140
|
- lib/wp2txt/version.rb
|
141
141
|
- spec/spec_helper.rb
|
142
142
|
- spec/utils_spec.rb
|
143
|
+
- tags
|
143
144
|
- wp2txt.gemspec
|
144
145
|
homepage: https://github.com/yohasebe/wp2txt
|
145
146
|
licenses: []
|
@@ -159,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
159
160
|
- !ruby/object:Gem::Version
|
160
161
|
version: '0'
|
161
162
|
requirements: []
|
162
|
-
rubygems_version: 3.3.
|
163
|
+
rubygems_version: 3.3.3
|
163
164
|
signing_key:
|
164
165
|
specification_version: 4
|
165
166
|
summary: A command-line toolkit to extract text content and category data from Wikipedia
|