wp2txt 0.5.4 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/README.md +32 -23
- data/bin/wp2txt +23 -23
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +7 -8
- data/spec/utils_spec.rb +42 -40
- data/wp2txt.gemspec +5 -5
- metadata +18 -32
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c05af7e5c72b073f18b53eca8619212f0928aaa1
|
4
|
+
data.tar.gz: 02dc116458041b096fd811b3fec6ffb6e6ff3ee7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 26479a43e8e3ccebfb6578562d62b7b6d2234e924838774a11d143c8fad7e1952db0278526d202a77aad680926b96baa25c8da9e0c23f916d95ccf614d5e82a3
|
7
|
+
data.tar.gz: 8959c9b51efb18386cf556c67439332fc43723d8de8abfc014745d0d0ad1f89ba56b6222b6f8d2a33748343e2dbef93752ee104c8a54c57e01fe21bee7e2f892
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -2,8 +2,6 @@
|
|
2
2
|
|
3
3
|
Wikipedia dump file to text converter
|
4
4
|
|
5
|
-
CAUTION: This software is on an experimental stage. Use with care!
|
6
|
-
|
7
5
|
### About ###
|
8
6
|
|
9
7
|
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata. It is originally intended to be useful for researchers who look for an easy way to obtain open-source multi-lingual corpora, but may be handy for other purposes.
|
@@ -14,15 +12,19 @@ WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compres
|
|
14
12
|
* Create output files of specified size.
|
15
13
|
* Allow users to specify text elements to be extracted/converted (page titles, section titles, lists, and tables).
|
16
14
|
|
17
|
-
WP2TXT before version 0.4.0 came with Mac/Windows GUI. Now it's become a pure command-line application--Sorry GUI folks, but there seems more demand for an easy-to-hack CUI package than a not-very-flexible GUI app.
|
18
|
-
|
19
15
|
### Installation
|
20
16
|
|
21
17
|
$ gem install wp2txt
|
22
18
|
|
19
|
+
It is highly recommended that you also install the bzip2-ruby gem. See the following for details about the bzip2-ruby gem:
|
20
|
+
|
21
|
+
[https://github.com/brianmario/bzip2-ruby](https://github.com/brianmario/bzip2-ruby)
|
22
|
+
|
23
|
+
When the above gem is not found, wp2txt will try to use the bzip2 program in your command line environment. Supposedly the former option is more reliable as well as faster.
|
24
|
+
|
23
25
|
### Usage
|
24
26
|
|
25
|
-
Obtain a Wikipedia dump file (
|
27
|
+
Obtain a Wikipedia dump file (from [here](http://dumps.wikimedia.org/backup-index.html)) with a file name such as:
|
26
28
|
|
27
29
|
xxwiki-yyyymmdd-pages-articles.xml.bz2
|
28
30
|
|
@@ -30,24 +32,31 @@ where `xx` is language code such as "en (English)" or "ja (Japanese)", and `yyy
|
|
30
32
|
|
31
33
|
Command line options are as follows:
|
32
34
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
35
|
+
*CAUTION:* command line options in the current version have been drastically changed from those in versions 0.5!
|
36
|
+
|
37
|
+
Usage: wp2txt [options]
|
38
|
+
where [options] are:
|
39
|
+
--input-file, -i: Wikipedia dump file with .bz2 (compressed) or
|
40
|
+
.txt (uncompressed) format
|
41
|
+
--output-dir, -o <s>: Output directory (default:
|
42
|
+
/Users/yohasebe/Dropbox/code/wp2txt)
|
43
|
+
--convert, --no-convert, -c: Output in plain text (converting from XML)
|
44
|
+
(default: true)
|
45
|
+
--list, --no-list, -l: Show list items in output (default: true)
|
46
|
+
--heading, --no-heading, -d: Show section titles in output (default: true)
|
47
|
+
--title, --no-title, -t: Show page titles in output (default: true)
|
48
|
+
--table, -a: Show table source code in output
|
49
|
+
--template, -e: Show template specifications in output
|
50
|
+
--redirect, -r: Show redirect destination
|
51
|
+
--marker, --no-marker, -m: Show symbols prefixed to list items,
|
52
|
+
definitions, etc. (Default: true)
|
53
|
+
--category, -g: Show article category information
|
54
|
+
--file-size, -f <i>: Approximate size (in MB) of each output file
|
55
|
+
(default: 10)
|
56
|
+
--version, -v: Print version and exit
|
57
|
+
--help, -h: Show this message
|
58
|
+
|
59
|
+
### Caveats ###
|
51
60
|
|
52
61
|
* Certain types of data such as mathematical equations and computer source code are not properly converted. Please remember this software is originally intended for correcting “sentences” for linguistic studies.
|
53
62
|
* Extraction of normal text data could sometimes fail for various reasons (e.g. illegal matching of begin/end tags, language-specific conventions of formatting, etc).
|
data/bin/wp2txt
CHANGED
@@ -26,15 +26,15 @@ EOS
|
|
26
26
|
|
27
27
|
opt :input_file, "Wikipedia dump file with .bz2 (compressed) or .txt (uncompressed) format", :required => true
|
28
28
|
opt :output_dir, "Output directory", :default => Dir::pwd, :type => String
|
29
|
-
opt :
|
30
|
-
opt :
|
31
|
-
opt :
|
32
|
-
opt :
|
33
|
-
opt :
|
34
|
-
opt :
|
35
|
-
opt :
|
36
|
-
opt :
|
37
|
-
opt :
|
29
|
+
opt :convert, "Output in plain text (converting from XML)", :default => true
|
30
|
+
opt :list, "Show list items in output", :default => true
|
31
|
+
opt :heading, "Show section titles in output", :default => true, :short => "-d"
|
32
|
+
opt :title, "Show page titles in output", :default => true
|
33
|
+
opt :table, "Show table source code in output", :default => false
|
34
|
+
opt :template, "Show template specifications in output", :default => false
|
35
|
+
opt :redirect, "Show redirect destination", :default => false
|
36
|
+
opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
|
37
|
+
opt :category, "Show article category information", :default => false
|
38
38
|
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
39
39
|
end
|
40
40
|
Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
@@ -43,9 +43,9 @@ Trollop::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
|
43
43
|
input_file = ARGV[0]
|
44
44
|
output_dir = opts[:output_dir]
|
45
45
|
tfile_size = opts[:file_size]
|
46
|
-
|
47
|
-
strip_tmarker = opts[:
|
48
|
-
opt_array = [:title_off, :
|
46
|
+
convert = opts[:convert]
|
47
|
+
strip_tmarker = opts[:marker] ? false : true
|
48
|
+
opt_array = [:title_off, :list, :heading, :table, :template, :redirect]
|
49
49
|
config = {}
|
50
50
|
opt_array.each do |opt|
|
51
51
|
config[opt] = opts[opt]
|
@@ -54,13 +54,13 @@ end
|
|
54
54
|
# a "parent" is either commandline progress bar or
|
55
55
|
# a gui window (not available for now)
|
56
56
|
parent = Wp2txt::CmdProgbar.new
|
57
|
-
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size,
|
57
|
+
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
58
58
|
|
59
59
|
wpconv.extract_text do |article|
|
60
60
|
title = format_wiki article.title
|
61
61
|
title = "[[#{title}]]\n"
|
62
62
|
|
63
|
-
if
|
63
|
+
if opts[:category] && !article.categories.empty?
|
64
64
|
contents = "\nCATEGORIES: "
|
65
65
|
contents += article.categories.join(", ")
|
66
66
|
contents += "\n\n"
|
@@ -71,31 +71,31 @@ wpconv.extract_text do |article|
|
|
71
71
|
article.elements.each do |e|
|
72
72
|
case e.first
|
73
73
|
when :mw_heading
|
74
|
-
next if config[:
|
74
|
+
next if !config[:heading]
|
75
75
|
line = format_wiki(e.last)
|
76
76
|
line += "+HEADING+" if $DEBUG_MODE
|
77
77
|
when :mw_paragraph
|
78
|
-
next if config[:
|
78
|
+
# next if !config[:paragraph]
|
79
79
|
line = format_wiki(e.last)
|
80
80
|
line += "+PARAGRAPH+" if $DEBUG_MODE
|
81
81
|
when :mw_table, :mw_htable
|
82
|
-
next if config[:
|
82
|
+
next if !config[:table]
|
83
83
|
line = format_wiki(e.last)
|
84
84
|
line += "+TABLE+" if $DEBUG_MODE
|
85
85
|
when :mw_pre
|
86
|
-
next if config[:
|
86
|
+
next if !config[:pre]
|
87
87
|
line = e.last
|
88
88
|
line += "+PRE+" if $DEBUG_MODE
|
89
89
|
when :mw_quote
|
90
|
-
next if config[:
|
90
|
+
# next if !config[:quote]
|
91
91
|
line = format_wiki(e.last)
|
92
92
|
line += "+QUOTE+" if $DEBUG_MODE
|
93
93
|
when :mw_unordered, :mw_ordered, :mw_definition
|
94
|
-
next if config[:
|
94
|
+
next if !config[:list]
|
95
95
|
line = format_wiki(e.last)
|
96
96
|
line += "+LIST+" if $DEBUG_MODE
|
97
97
|
when :mw_redirect
|
98
|
-
next if config[:
|
98
|
+
next if !config[:redirect]
|
99
99
|
line = format_wiki(e.last)
|
100
100
|
line += "+REDIRECT+" if $DEBUG_MODE
|
101
101
|
line += "\n\n"
|
@@ -108,14 +108,14 @@ wpconv.extract_text do |article|
|
|
108
108
|
end
|
109
109
|
end
|
110
110
|
contents += line
|
111
|
-
contents = remove_templates(contents)
|
111
|
+
contents = remove_templates(contents) unless config[:template]
|
112
112
|
end
|
113
113
|
|
114
114
|
##### cleanup #####
|
115
115
|
if /\A\s*\z/m =~ contents
|
116
116
|
result = ""
|
117
117
|
else
|
118
|
-
result = config[:
|
118
|
+
result = config[:title] ? title + "\n" + contents : contents
|
119
119
|
end
|
120
120
|
result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
|
121
121
|
result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
|
data/lib/wp2txt/version.rb
CHANGED
data/lib/wp2txt.rb
CHANGED
@@ -25,16 +25,16 @@ module Wp2txt
|
|
25
25
|
|
26
26
|
include Wp2txt
|
27
27
|
|
28
|
-
# attr_accessor :pause_flag, :stop_flag, :outfiles, :
|
28
|
+
# attr_accessor :pause_flag, :stop_flag, :outfiles, :convert
|
29
29
|
|
30
|
-
def initialize(parent, input_file, output_dir = ".", tfile_size = 10,
|
30
|
+
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
|
31
31
|
@parent = parent
|
32
32
|
@fp = nil
|
33
33
|
|
34
34
|
@input_file = input_file
|
35
35
|
@output_dir = output_dir
|
36
36
|
@tfile_size = tfile_size
|
37
|
-
@
|
37
|
+
@convert = convert
|
38
38
|
@strip_tmarker = strip_tmarker
|
39
39
|
end
|
40
40
|
|
@@ -213,16 +213,15 @@ module Wp2txt
|
|
213
213
|
# call this method to do the job
|
214
214
|
def extract_text(&block)
|
215
215
|
prepare
|
216
|
-
|
217
|
-
if @convert_off
|
218
|
-
extract
|
219
|
-
# convert xml to plain text
|
220
|
-
else
|
216
|
+
if @convert
|
221
217
|
if block
|
222
218
|
extract_and_convert(&block)
|
223
219
|
else
|
224
220
|
extract_and_convert
|
225
221
|
end
|
222
|
+
else
|
223
|
+
# output the original xml only split to files of the specified size
|
224
|
+
extract
|
226
225
|
end
|
227
226
|
end
|
228
227
|
|
data/spec/utils_spec.rb
CHANGED
@@ -23,7 +23,7 @@ describe "Wp2txt" do
|
|
23
23
|
str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
|
24
24
|
"<<" + content + ">>"
|
25
25
|
end
|
26
|
-
str_processed.
|
26
|
+
expect(str_processed).to eq str_after
|
27
27
|
|
28
28
|
str_before = "#* {{quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|
29
29
|
|passage={{...}} every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.}}"
|
@@ -33,7 +33,9 @@ describe "Wp2txt" do
|
|
33
33
|
str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
|
34
34
|
"<<" + content + ">>"
|
35
35
|
end
|
36
|
-
str_processed.should == str_after
|
36
|
+
#str_processed.should == str_after
|
37
|
+
expect(str_processed).to eq str_after
|
38
|
+
|
37
39
|
end
|
38
40
|
end
|
39
41
|
|
@@ -41,7 +43,7 @@ describe "Wp2txt" do
|
|
41
43
|
it "replaces character references with real characters" do
|
42
44
|
str_before = " < > & ""
|
43
45
|
str_after = " < > & \""
|
44
|
-
special_chr(str_before).
|
46
|
+
expect(special_chr(str_before)).to eq str_after
|
45
47
|
end
|
46
48
|
end
|
47
49
|
|
@@ -49,7 +51,7 @@ describe "Wp2txt" do
|
|
49
51
|
it "replaces character references with real characters" do
|
50
52
|
str_before = "♪"
|
51
53
|
str_after = "♪"
|
52
|
-
chrref_to_utf(str_before).
|
54
|
+
expect(chrref_to_utf(str_before)).to eq str_after
|
53
55
|
end
|
54
56
|
end
|
55
57
|
|
@@ -57,7 +59,7 @@ describe "Wp2txt" do
|
|
57
59
|
it "replaces {mdash}, {ndash}, or {–} with '–'" do
|
58
60
|
str_before = "{mdash} {ndash} {–}"
|
59
61
|
str_after = "– – –"
|
60
|
-
mndash(str_before).
|
62
|
+
expect(mndash(str_before)).to eq str_after
|
61
63
|
end
|
62
64
|
end
|
63
65
|
|
@@ -65,7 +67,7 @@ describe "Wp2txt" do
|
|
65
67
|
it "replaces \\r\\n and <br /> inside [ref] ... [/ref] to ' '" do
|
66
68
|
str_before = "[ref]...\r\n...<br />...[/ref]"
|
67
69
|
str_after = "... ... ..."
|
68
|
-
format_ref(str_before).
|
70
|
+
expect(format_ref(str_before)).to eq str_after
|
69
71
|
end
|
70
72
|
end
|
71
73
|
|
@@ -73,7 +75,7 @@ describe "Wp2txt" do
|
|
73
75
|
it "replaces <ref> tag with [ref]" do
|
74
76
|
str_before = "<ref> ... <br /> ... </ref> \n <ref />"
|
75
77
|
str_after = "[ref] ... \n ... [/ref] \n "
|
76
|
-
make_reference(str_before).
|
78
|
+
expect(make_reference(str_before)).to eq str_after
|
77
79
|
end
|
78
80
|
end
|
79
81
|
|
@@ -81,7 +83,7 @@ describe "Wp2txt" do
|
|
81
83
|
it "removes table formated parts" do
|
82
84
|
str_before = "{| ... \n{| ... \n ...|}\n ...|}"
|
83
85
|
str_after = ""
|
84
|
-
remove_table(str_before).
|
86
|
+
expect(remove_table(str_before)).to eq str_after
|
85
87
|
end
|
86
88
|
end
|
87
89
|
|
@@ -89,7 +91,7 @@ describe "Wp2txt" do
|
|
89
91
|
it "removes clade formated parts" do
|
90
92
|
str_before = "\{\{clade ... \n ... \n ... \n\}\}"
|
91
93
|
str_after = ""
|
92
|
-
remove_clade(str_before).
|
94
|
+
expect(remove_clade(str_before)).to eq str_after
|
93
95
|
end
|
94
96
|
end
|
95
97
|
|
@@ -97,7 +99,7 @@ describe "Wp2txt" do
|
|
97
99
|
it "removes horizontal lines" do
|
98
100
|
str_before = "\n----\n--\n--\n"
|
99
101
|
str_after = "\n\n"
|
100
|
-
remove_hr(str_before).
|
102
|
+
expect(remove_hr(str_before)).to eq str_after
|
101
103
|
end
|
102
104
|
end
|
103
105
|
|
@@ -105,10 +107,10 @@ describe "Wp2txt" do
|
|
105
107
|
it "removes tags" do
|
106
108
|
str_before = "<tag>abc</tag>"
|
107
109
|
str_after = "abc"
|
108
|
-
remove_tag(str_before).
|
110
|
+
expect(remove_tag(str_before)).to eq str_after
|
109
111
|
str_before = "[tag]def[/tag]"
|
110
112
|
str_after = "def"
|
111
|
-
remove_tag(str_before, ['[', ']']).
|
113
|
+
expect(remove_tag(str_before, ['[', ']'])).to eq str_after
|
112
114
|
end
|
113
115
|
end
|
114
116
|
|
@@ -116,7 +118,7 @@ describe "Wp2txt" do
|
|
116
118
|
it "removes directive" do
|
117
119
|
str_before = "__abc__\n __def__"
|
118
120
|
str_after = "\n "
|
119
|
-
remove_directive(str_before).
|
121
|
+
expect(remove_directive(str_before)).to eq str_after
|
120
122
|
end
|
121
123
|
end
|
122
124
|
|
@@ -124,7 +126,7 @@ describe "Wp2txt" do
|
|
124
126
|
it "removes directive" do
|
125
127
|
str_before = "''abc''\n'''def'''"
|
126
128
|
str_after = "abc\ndef"
|
127
|
-
remove_emphasis(str_before).
|
129
|
+
expect(remove_emphasis(str_before)).to eq str_after
|
128
130
|
end
|
129
131
|
end
|
130
132
|
|
@@ -132,7 +134,7 @@ describe "Wp2txt" do
|
|
132
134
|
it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
|
133
135
|
str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
|
134
136
|
str_after = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
|
135
|
-
escape_nowiki(str_before).
|
137
|
+
expect(escape_nowiki(str_before)).to match str_after
|
136
138
|
end
|
137
139
|
end
|
138
140
|
|
@@ -141,24 +143,24 @@ describe "Wp2txt" do
|
|
141
143
|
@nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
|
142
144
|
str_before = "<nowiki-123>def<nowiki-124>"
|
143
145
|
str_after = "[[abc]]def[[ghi]]"
|
144
|
-
unescape_nowiki(str_before).
|
146
|
+
expect(unescape_nowiki(str_before)).to eq str_after
|
145
147
|
end
|
146
148
|
end
|
147
149
|
|
148
150
|
describe "process_interwiki_links" do
|
149
151
|
it "formats text link and remove brackets" do
|
150
|
-
process_interwiki_links("[[a b]]").
|
151
|
-
process_interwiki_links("[[a b|c]]").
|
152
|
-
process_interwiki_links("[[a|b|c]]").
|
153
|
-
process_interwiki_links("[[硬口蓋鼻音|[ɲ], /J/]]").
|
152
|
+
expect(process_interwiki_links("[[a b]]")).to eq "a b"
|
153
|
+
expect(process_interwiki_links("[[a b|c]]")).to eq "c"
|
154
|
+
expect(process_interwiki_links("[[a|b|c]]")).to eq "b|c"
|
155
|
+
expect(process_interwiki_links("[[硬口蓋鼻音|[ɲ], /J/]]")).to eq "[ɲ], /J/"
|
154
156
|
end
|
155
157
|
end
|
156
158
|
|
157
159
|
describe "process_external_links" do
|
158
160
|
it "formats text link and remove brackets" do
|
159
|
-
process_external_links("[http://yohasebe.com yohasebe.com]").
|
160
|
-
process_external_links("[http://yohasebe.com]").
|
161
|
-
process_external_links("* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}").
|
161
|
+
expect(process_external_links("[http://yohasebe.com yohasebe.com]")).to eq "yohasebe.com"
|
162
|
+
expect(process_external_links("[http://yohasebe.com]")).to eq "http://yohasebe.com"
|
163
|
+
expect(process_external_links("* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}")).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
|
162
164
|
end
|
163
165
|
end
|
164
166
|
|
@@ -166,30 +168,30 @@ describe "Wp2txt" do
|
|
166
168
|
it "removes brackets and leaving some text" do
|
167
169
|
str_before = "{{}}"
|
168
170
|
str_after = ""
|
169
|
-
process_template(str_before).
|
171
|
+
expect(process_template(str_before)).to eq str_after
|
170
172
|
str_before = "{{lang|en|Japan}}"
|
171
173
|
str_after = "Japan"
|
172
|
-
process_template(str_before).
|
174
|
+
expect(process_template(str_before)).to eq str_after
|
173
175
|
str_before = "{{a|b=c|d=f}}"
|
174
176
|
str_after = "a"
|
175
|
-
process_template(str_before).
|
177
|
+
expect(process_template(str_before)).to eq str_after
|
176
178
|
str_before = "{{a|b|{{c|d|e}}}}"
|
177
179
|
str_after = "e"
|
178
|
-
process_template(str_before).
|
180
|
+
expect(process_template(str_before)).to eq str_after
|
179
181
|
end
|
180
182
|
end
|
181
183
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
<span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
|
190
|
-
EOD
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
184
|
+
# describe "expand_template" do
|
185
|
+
# it "gets data corresponding to a given template using mediawiki api" do
|
186
|
+
# uri = "http://en.wiktionary.org/w/api.php"
|
187
|
+
# template = "{{en-verb}}"
|
188
|
+
# word = "kick"
|
189
|
+
# expanded = expand_template(uri, template, word)
|
190
|
+
# html =<<EOD
|
191
|
+
# <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
|
192
|
+
# EOD
|
193
|
+
# html.strip!
|
194
|
+
# expanded.should == html
|
195
|
+
# end
|
196
|
+
# end
|
195
197
|
end
|
data/wp2txt.gemspec
CHANGED
@@ -24,10 +24,10 @@ Gem::Specification.new do |s|
|
|
24
24
|
|
25
25
|
s.add_dependency "nokogiri"
|
26
26
|
s.add_dependency "sanitize"
|
27
|
-
if RUBY_VERSION >= '2.0'
|
28
|
-
|
29
|
-
else
|
30
|
-
|
31
|
-
end
|
27
|
+
# if RUBY_VERSION >= '2.0'
|
28
|
+
# s.add_dependency "bzip2-ruby-rb20"
|
29
|
+
# else
|
30
|
+
# s.add_dependency "bzip2-ruby"
|
31
|
+
# end
|
32
32
|
s.add_dependency "trollop"
|
33
33
|
end
|
metadata
CHANGED
@@ -1,111 +1,97 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-10-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rake
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: nokogiri
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- -
|
59
|
+
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '0'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- -
|
66
|
+
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: sanitize
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- -
|
73
|
+
- - ">="
|
74
74
|
- !ruby/object:Gem::Version
|
75
75
|
version: '0'
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- -
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: bzip2-ruby-rb20
|
85
|
-
requirement: !ruby/object:Gem::Requirement
|
86
|
-
requirements:
|
87
|
-
- - '>='
|
88
|
-
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
90
|
-
type: :runtime
|
91
|
-
prerelease: false
|
92
|
-
version_requirements: !ruby/object:Gem::Requirement
|
93
|
-
requirements:
|
94
|
-
- - '>='
|
80
|
+
- - ">="
|
95
81
|
- !ruby/object:Gem::Version
|
96
82
|
version: '0'
|
97
83
|
- !ruby/object:Gem::Dependency
|
98
84
|
name: trollop
|
99
85
|
requirement: !ruby/object:Gem::Requirement
|
100
86
|
requirements:
|
101
|
-
- -
|
87
|
+
- - ">="
|
102
88
|
- !ruby/object:Gem::Version
|
103
89
|
version: '0'
|
104
90
|
type: :runtime
|
105
91
|
prerelease: false
|
106
92
|
version_requirements: !ruby/object:Gem::Requirement
|
107
93
|
requirements:
|
108
|
-
- -
|
94
|
+
- - ">="
|
109
95
|
- !ruby/object:Gem::Version
|
110
96
|
version: '0'
|
111
97
|
description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
|
@@ -117,7 +103,7 @@ executables:
|
|
117
103
|
extensions: []
|
118
104
|
extra_rdoc_files: []
|
119
105
|
files:
|
120
|
-
- .gitignore
|
106
|
+
- ".gitignore"
|
121
107
|
- Gemfile
|
122
108
|
- LICENSE
|
123
109
|
- README.md
|
@@ -142,17 +128,17 @@ require_paths:
|
|
142
128
|
- lib
|
143
129
|
required_ruby_version: !ruby/object:Gem::Requirement
|
144
130
|
requirements:
|
145
|
-
- -
|
131
|
+
- - ">="
|
146
132
|
- !ruby/object:Gem::Version
|
147
133
|
version: '0'
|
148
134
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
149
135
|
requirements:
|
150
|
-
- -
|
136
|
+
- - ">="
|
151
137
|
- !ruby/object:Gem::Version
|
152
138
|
version: '0'
|
153
139
|
requirements: []
|
154
140
|
rubyforge_project: wp2txt
|
155
|
-
rubygems_version: 2.1
|
141
|
+
rubygems_version: 2.4.1
|
156
142
|
signing_key:
|
157
143
|
specification_version: 4
|
158
144
|
summary: Wikipedia dump to text converter
|