wp2txt 0.9.4 → 0.9.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +34 -17
- data/bin/wp2txt +7 -6
- data/data/output_samples/testdata_en.txt +11923 -36921
- data/data/output_samples/testdata_en_categories.txt +107 -182
- data/data/output_samples/testdata_en_summary.txt +1368 -0
- data/data/output_samples/testdata_ja.txt +24812 -4686
- data/data/output_samples/testdata_ja_categories.txt +202 -44
- data/data/output_samples/testdata_ja_summary.txt +1684 -0
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/lib/wp2txt/article.rb +3 -2
- data/lib/wp2txt/utils.rb +51 -27
- data/lib/wp2txt/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bf8270b3488c0045a067f71c155db8d9ac6366a94d825eed9bc6d05c95598345
|
4
|
+
data.tar.gz: 8802949a232c60d8b5ae6f93726154f7a6b40436b478919657f58f4bdc54add3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0a10804d78c33e035aaf429dd4613f84f3db0c6f22a6c36617a1fda25f03c0fd8fac224ec9e6009ab6ddddb475d73e6eda4c21606f89ef94950bc3749ce4f452
|
7
|
+
data.tar.gz: 4a8ea2f0900c6f97d3dcaf6c6387b3543d23962ea064cf1b18a8c293b4664c16fcabca9230aa83b0f5685eacffed74968fe82529566080b7821fd944c7bf275d
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -4,16 +4,18 @@ Wikipedia dump file to text converter that extracts both content and category da
|
|
4
4
|
|
5
5
|
## About
|
6
6
|
|
7
|
-
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2)
|
7
|
+
WP2TXT extracts plain text data from a Wikipedia dump file (encoded in XML / compressed with Bzip2), removing all MediaWiki markup and other metadata. It was developed for researchers who want easy access to open-source multilingual corpora, but may be used for other purposes as well.
|
8
|
+
|
9
|
+
**UPDATE (July 2022)**: Version 0.9.3 adds a new option `category_only`. When this option is enabled, wp2txt will extract only the title and category information of the article. See output examples below.
|
8
10
|
|
9
|
-
**UPDATE (July 2022)**: Version 0.9.3 has added a new option `category_only`. With this option enabled, wp2txt extracts article title and category info only. Please see output examples below.
|
10
11
|
|
11
12
|
## Features
|
12
13
|
|
13
|
-
*
|
14
|
-
*
|
15
|
-
*
|
16
|
-
*
|
14
|
+
* Converts Wikipedia dump files in various languages
|
15
|
+
* Creates output files of specified size
|
16
|
+
* Allows specifying which text elements are extracted and converted (page titles, section titles, lists, tables)
|
17
|
+
* Can extract category information for each article
|
18
|
+
|
17
19
|
|
18
20
|
## Installation
|
19
21
|
|
@@ -23,11 +25,11 @@ WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compres
|
|
23
25
|
|
24
26
|
Obtain a Wikipedia dump file (from [here](http://dumps.wikimedia.org/backup-index.html)) with a file name such as:
|
25
27
|
|
26
|
-
|
28
|
+
> `xxwiki-yyyymmdd-pages-articles.xml.bz2`
|
27
29
|
|
28
30
|
where `xx` is a language code such as "en (English)" or "ja (Japanese)", and `yyyymmdd` is the date of creation (e.g. 20220720).
|
29
31
|
|
30
|
-
### Example 1
|
32
|
+
### Example 1: Basic
|
31
33
|
|
32
34
|
The following extracts text data, including list items and excluding tables.
|
33
35
|
|
@@ -36,15 +38,29 @@ The following extracts text data, including list items and excluding tables.
|
|
36
38
|
- [Output example (English)](https://raw.githubusercontent.com/yohasebe/wp2txt/master/data/output_samples/testdata_en.txt)
|
37
39
|
- [Output example (Japanese)](https://raw.githubusercontent.com/yohasebe/wp2txt/master/data/output_samples/testdata_ja.txt)
|
38
40
|
|
39
|
-
### Example 2
|
41
|
+
### Example 2: Title and category information only
|
40
42
|
|
41
43
|
The following will extract only article titles and the categories to which each article belongs:
|
42
44
|
|
43
45
|
$ wp2txt --category-only -i xxwiki-yyyymmdd-pages-articles.xml.bz2 -o /output_dir
|
44
46
|
|
47
|
+
Each line of the output data contains the title and the categories of an article:
|
48
|
+
|
49
|
+
> title `TAB` category1`,` category2`,` category3`,` ...
|
50
|
+
|
45
51
|
- [Output example (English)](https://raw.githubusercontent.com/yohasebe/wp2txt/master/data/output_samples/testdata_en_categories.txt)
|
46
52
|
- [Output example (Japanese)](https://raw.githubusercontent.com/yohasebe/wp2txt/master/data/output_samples/testdata_ja_categories.txt)
|
47
53
|
|
54
|
+
### Example 3: Title, category, and summary text only
|
55
|
+
|
56
|
+
The following will extract only article titles, the categories to which each article belongs, and text blocks before the first heading of the article:
|
57
|
+
|
58
|
+
$ wp2txt --summary-only -i xxwiki-yyyymmdd-pages-articles.xml.bz2 -o /output_dir
|
59
|
+
|
60
|
+
- [Output example (English)](https://raw.githubusercontent.com/yohasebe/wp2txt/master/data/output_samples/testdata_en_summary.txt)
|
61
|
+
- [Output example (Japanese)](https://raw.githubusercontent.com/yohasebe/wp2txt/master/data/output_samples/testdata_ja_summary.txt)
|
62
|
+
|
63
|
+
|
48
64
|
## Options
|
49
65
|
|
50
66
|
Command line options are as follows:
|
@@ -69,6 +85,7 @@ Command line options are as follows:
|
|
69
85
|
definitions, etc. (Default: true)
|
70
86
|
--category, -g: Show article category information (default: true)
|
71
87
|
--category-only, -y: Extract only article title and categories (default: false)
|
88
|
+
-s, --summary-only: Extract only article title, categories, and summary text before first heading (default: false)
|
72
89
|
--file-size, -f <i>: Approximate size (in MB) of each output file
|
73
90
|
(default: 10)
|
74
91
|
-u, --num-threads=<i>: Number of threads to be spawned (capped to the number of CPU cores;
|
@@ -78,26 +95,26 @@ Command line options are as follows:
|
|
78
95
|
|
79
96
|
## Caveats
|
80
97
|
|
81
|
-
*
|
82
|
-
*
|
83
|
-
*
|
84
|
-
*
|
98
|
+
* Some data, such as mathematical formulas and computer source code, will not be converted correctly.
|
99
|
+
* Some text data may not be extracted correctly for various reasons (incorrect matching of begin/end tags, language-specific formatting rules, etc.).
|
100
|
+
* The conversion process can take longer than expected. When dealing with a huge data set such as the English Wikipedia in a low-spec environment, it can take several hours or more.
|
101
|
+
* WP2TXT, by the nature of its task, requires a lot of machine power and consumes a large amount of memory/storage resources. Therefore, the process may stop unexpectedly. In the worst case, the process may even freeze without terminating successfully. Please understand this and use at your own risk.
|
85
102
|
|
86
|
-
|
103
|
+
## Useful Links
|
87
104
|
|
88
105
|
* [Wikipedia Database backup dumps](http://dumps.wikimedia.org/backup-index.html)
|
89
106
|
|
90
|
-
|
107
|
+
## Author
|
91
108
|
|
92
109
|
* Yoichiro Hasebe (<yohasebe@gmail.com>)
|
93
110
|
|
94
|
-
|
111
|
+
## References
|
95
112
|
|
96
113
|
The author will appreciate your mentioning one of these in your research.
|
97
114
|
|
98
115
|
* Yoichiro HASEBE. 2006. [Method for using Wikipedia as Japanese corpus.](http://ci.nii.ac.jp/naid/110006226727) _Doshisha Studies in Language and Culture_ 9(2), 373-403.
|
99
116
|
* 長谷部陽一郎. 2006. [Wikipedia日本語版をコーパスとして用いた言語研究の手法](http://ci.nii.ac.jp/naid/110006226727). 『言語文化』9(2), 373-403.
|
100
117
|
|
101
|
-
|
118
|
+
## License
|
102
119
|
|
103
120
|
This software is distributed under the MIT License. Please see the LICENSE file.
|
data/bin/wp2txt
CHANGED
@@ -27,7 +27,7 @@ EOS
|
|
27
27
|
opt :input_file, "Wikipedia dump file with .bz2 (compressed) or .txt (uncompressed) format", :required => true
|
28
28
|
opt :output_dir, "Output directory", :default => Dir::pwd, :type => String
|
29
29
|
opt :convert, "Output in plain text (converting from XML)", :default => true
|
30
|
-
opt :list, "Show list items in output", :default =>
|
30
|
+
opt :list, "Show list items in output", :default => false
|
31
31
|
opt :heading, "Show section titles in output", :default => true, :short => "-d"
|
32
32
|
opt :title, "Show page titles in output", :default => true
|
33
33
|
opt :table, "Show table source code in output", :default => false
|
@@ -38,6 +38,7 @@ EOS
|
|
38
38
|
opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
|
39
39
|
opt :category, "Show article category information", :default => true
|
40
40
|
opt :category_only, "Extract only article title and categories", :default => false
|
41
|
+
opt :summary_only, "Extract only article title, categories, and summary text before first heading", :default => false
|
41
42
|
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
42
43
|
opt :num_threads, "Number of threads to be spawned (capped to the number of CPU cores; set 99 to spawn max num of threads)", :default => 4
|
43
44
|
end
|
@@ -50,10 +51,9 @@ tfile_size = opts[:file_size]
|
|
50
51
|
num_threads = opts[:num_threads]
|
51
52
|
convert = opts[:convert]
|
52
53
|
strip_tmarker = opts[:marker] ? false : true
|
53
|
-
opt_array = [:title, :list, :heading, :table, :redirect, :multiline]
|
54
|
+
opt_array = [:title, :list, :heading, :table, :redirect, :multiline, :category, :category_only, :summary_only]
|
54
55
|
$leave_inline_template = true if opts[:inline]
|
55
56
|
$leave_ref = true if opts[:ref]
|
56
|
-
# $leave_table = true if opts[:table]
|
57
57
|
config = {}
|
58
58
|
opt_array.each do |opt|
|
59
59
|
config[opt] = opts[opt]
|
@@ -65,11 +65,11 @@ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, num_thre
|
|
65
65
|
wpconv.extract_text do |article|
|
66
66
|
format_wiki!(article.title)
|
67
67
|
|
68
|
-
if
|
68
|
+
if config[:category_only]
|
69
69
|
title = "#{article.title}\t"
|
70
70
|
contents = article.categories.join(", ")
|
71
71
|
contents << "\n"
|
72
|
-
elsif
|
72
|
+
elsif config[:category] && !article.categories.empty?
|
73
73
|
title = "\n[[#{article.title}]]\n\n"
|
74
74
|
contents = "\nCATEGORIES: "
|
75
75
|
contents << article.categories.join(", ")
|
@@ -79,10 +79,11 @@ wpconv.extract_text do |article|
|
|
79
79
|
contents = ""
|
80
80
|
end
|
81
81
|
|
82
|
-
unless
|
82
|
+
unless config[:category_only]
|
83
83
|
article.elements.each do |e|
|
84
84
|
case e.first
|
85
85
|
when :mw_heading
|
86
|
+
break if config[:summary_only]
|
86
87
|
next if !config[:heading]
|
87
88
|
format_wiki!(e.last)
|
88
89
|
line = e.last
|