wp2txt 0.5.4 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 07ab0b6030dcc43512ef8f17504519dc06f72a25
4
- data.tar.gz: 915a4ea67ab2bfbe7b465d3d6d718c30a02b463f
3
+ metadata.gz: c05af7e5c72b073f18b53eca8619212f0928aaa1
4
+ data.tar.gz: 02dc116458041b096fd811b3fec6ffb6e6ff3ee7
5
5
  SHA512:
6
- metadata.gz: f2d66c66a8e87dc8d35c7627729e44dc3a9d6c93624a3055e925ad293fbf21aff3e012dfd4e00d14b815e9b8ee7f303b704430555a7525bb497d8ae11f390968
7
- data.tar.gz: 75f6245efe1b55dfcb0612b49d8b2fd7e96cf3fb2dbe729abbcc3761f6d7431f2e9408c8970bf310e96626819e884ce681bfb9685462df3461ca0f494e0d483d
6
+ metadata.gz: 26479a43e8e3ccebfb6578562d62b7b6d2234e924838774a11d143c8fad7e1952db0278526d202a77aad680926b96baa25c8da9e0c23f916d95ccf614d5e82a3
7
+ data.tar.gz: 8959c9b51efb18386cf556c67439332fc43723d8de8abfc014745d0d0ad1f89ba56b6222b6f8d2a33748343e2dbef93752ee104c8a54c57e01fe21bee7e2f892
data/.gitignore CHANGED
@@ -9,7 +9,7 @@ _yardoc
9
9
  coverage
10
10
  doc/
11
11
  lib/bundler/man
12
- pkg
12
+ pkg/
13
13
  rdoc
14
14
  spec/reports
15
15
  test/tmp
data/README.md CHANGED
@@ -2,8 +2,6 @@
2
2
 
3
3
  Wikipedia dump file to text converter
4
4
 
5
- CAUTION: This software is on an experimental stage. Use with care!
6
-
7
5
  ### About ###
8
6
 
9
7
  WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata. It is originally intended to be useful for researchers who look for an easy way to obtain open-source multi-lingual corpora, but may be handy for other purposes.
@@ -14,15 +12,19 @@ WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compres
14
12
  * Create output files of specified size.
15
13
  * Allow users to specify text elements to be extracted/converted (page titles, section titles, lists, and tables).
16
14
 
17
- WP2TXT before version 0.4.0 came with Mac/Windows GUI. Now it's become a pure command-line application--Sorry GUI folks, but there seems more demand for an easy-to-hack CUI package than a not-very-flexible GUI app.
18
-
19
15
  ### Installation
20
16
 
21
17
  $ gem install wp2txt
22
18
 
19
+ It is highly recommended that you also install the bzip2-ruby gem. See the following for details about the bzip2-ruby gem:
20
+
21
+ [https://github.com/brianmario/bzip2-ruby](https://github.com/brianmario/bzip2-ruby)
22
+
23
+ When the above gem is not found, wp2txt will try to use the bzip2 program in your command line environment. Supposedly the former option is more reliable as well as faster.
24
+
23
25
  ### Usage
24
26
 
25
- Obtain a Wikipedia dump file (see the link below) with a file name such as:
27
+ Obtain a Wikipedia dump file (from [here](http://dumps.wikimedia.org/backup-index.html)) with a file name such as:
26
28
 
27
29
  xxwiki-yyyymmdd-pages-articles.xml.bz2
28
30
 
@@ -30,24 +32,31 @@ where `xx` is language code such as "en (English)" or "ja (Japanese)", and `yyy
30
32
 
31
33
  Command line options are as follows:
32
34
 
33
- Usage: wp2txt [options]
34
- where [options] are:
35
- --input-file, -i: Wikipedia dump file with .bz2 (compressed) or .txt (uncompressed) format
36
- --output-dir, -o <s>: Output directory (default: Present working directory)
37
- --convert-off, -c: Output XML (without converting to plain text)
38
- --list-off, -l: Exclude list items from output
39
- --heading-off, -d: Exclude section titles from output
40
- --title-off, -t: Exclude page titles from output
41
- --table-off, --no-table-off, -a: Exclude page titles from output (default: true)
42
- --template-off, --no-template-off, -e: Remove template notations from output (default: true)
43
- --redirect-off, -r: Not show redirect destination
44
- --strip-marker, -s: Remove symbols prefixed to list items, definitions, etc.
45
- --category-off, -g: Not show article category information
46
- --file-size, -f <i>: Approximate size (in MB) of each output file (default: 10)
47
- --version, -v: Print version and exit
48
- --help, -h: Show this message
49
-
50
- ### Limitations ###
35
+ *CAUTION:* command line options in the current version have been drastically changed from those in versions 0.5!
36
+
37
+ Usage: wp2txt [options]
38
+ where [options] are:
39
+ --input-file, -i: Wikipedia dump file with .bz2 (compressed) or
40
+ .txt (uncompressed) format
41
+ --output-dir, -o <s>: Output directory (default:
42
+ current working directory)
43
+ --convert, --no-convert, -c: Output in plain text (converting from XML)
44
+ (default: true)
45
+ --list, --no-list, -l: Show list items in output (default: true)
46
+ --heading, --no-heading, -d: Show section titles in output (default: true)
47
+ --title, --no-title, -t: Show page titles in output (default: true)
48
+ --table, -a: Show table source code in output
49
+ --template, -e: Show template specifications in output
50
+ --redirect, -r: Show redirect destination
51
+ --marker, --no-marker, -m: Show symbols prefixed to list items,
52
+ definitions, etc. (default: true)
53
+ --category, -g: Show article category information
54
+ --file-size, -f <i>: Approximate size (in MB) of each output file
55
+ (default: 10)
56
+ --version, -v: Print version and exit
57
+ --help, -h: Show this message
58
+
59
+ ### Caveats ###
51
60
 
52
61
  * Certain types of data such as mathematical equations and computer source code are not properly converted. Please remember this software is originally intended for correcting “sentences” for linguistic studies.
53
62
  * Extraction of normal text data could sometimes fail for various reasons (e.g. illegal matching of begin/end tags, language-specific conventions of formatting, etc).
data/bin/wp2txt CHANGED
@@ -26,15 +26,15 @@ EOS
26
26
 
27
27
  opt :input_file, "Wikipedia dump file with .bz2 (compressed) or .txt (uncompressed) format", :required => true
28
28
  opt :output_dir, "Output directory", :default => Dir::pwd, :type => String
29
- opt :convert_off, "Output XML (without converting to plain text)", :default => false
30
- opt :list_off, "Exclude list items from output", :default => false
31
- opt :heading_off, "Exclude section titles from output", :default => false, :short => "-d"
32
- opt :title_off, "Exclude page titles from output", :default => false
33
- opt :table_off, "Exclude page titles from output", :default => true
34
- opt :template_off, "Remove template notations from output", :default => true
35
- opt :redirect_off, "Not show redirect destination", :default => false
36
- opt :strip_marker, "Remove symbols prefixed to list items, definitions, etc.", :default => false
37
- opt :category_off, "Not show article category information", :default => false
29
+ opt :convert, "Output in plain text (converting from XML)", :default => true
30
+ opt :list, "Show list items in output", :default => true
31
+ opt :heading, "Show section titles in output", :default => true, :short => "-d"
32
+ opt :title, "Show page titles in output", :default => true
33
+ opt :table, "Show table source code in output", :default => false
34
+ opt :template, "Show template specifications in output", :default => false
35
+ opt :redirect, "Show redirect destination", :default => false
36
+ opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
37
+ opt :category, "Show article category information", :default => false
38
38
  opt :file_size, "Approximate size (in MB) of each output file", :default => 10
39
39
  end
40
40
  Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
@@ -43,9 +43,9 @@ Trollop::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
43
43
  input_file = ARGV[0]
44
44
  output_dir = opts[:output_dir]
45
45
  tfile_size = opts[:file_size]
46
- convert_off = opts[:convert_off]
47
- strip_tmarker = opts[:strip_marker]
48
- opt_array = [:title_off, :list_off, :heading_off, :table_off, :template_off, :redirect_off]
46
+ convert = opts[:convert]
47
+ strip_tmarker = opts[:marker] ? false : true
48
+ opt_array = [:title_off, :list, :heading, :table, :template, :redirect]
49
49
  config = {}
50
50
  opt_array.each do |opt|
51
51
  config[opt] = opts[opt]
@@ -54,13 +54,13 @@ end
54
54
  # a "parent" is either commandline progress bar or
55
55
  # a gui window (not available for now)
56
56
  parent = Wp2txt::CmdProgbar.new
57
- wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert_off, strip_tmarker)
57
+ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
58
58
 
59
59
  wpconv.extract_text do |article|
60
60
  title = format_wiki article.title
61
61
  title = "[[#{title}]]\n"
62
62
 
63
- if !opts[:category_off] && !article.categories.empty?
63
+ if opts[:category] && !article.categories.empty?
64
64
  contents = "\nCATEGORIES: "
65
65
  contents += article.categories.join(", ")
66
66
  contents += "\n\n"
@@ -71,31 +71,31 @@ wpconv.extract_text do |article|
71
71
  article.elements.each do |e|
72
72
  case e.first
73
73
  when :mw_heading
74
- next if config[:heading_off]
74
+ next if !config[:heading]
75
75
  line = format_wiki(e.last)
76
76
  line += "+HEADING+" if $DEBUG_MODE
77
77
  when :mw_paragraph
78
- next if config[:paragraph_off]
78
+ # next if !config[:paragraph]
79
79
  line = format_wiki(e.last)
80
80
  line += "+PARAGRAPH+" if $DEBUG_MODE
81
81
  when :mw_table, :mw_htable
82
- next if config[:table_off]
82
+ next if !config[:table]
83
83
  line = format_wiki(e.last)
84
84
  line += "+TABLE+" if $DEBUG_MODE
85
85
  when :mw_pre
86
- next if config[:pre_off]
86
+ next if !config[:pre]
87
87
  line = e.last
88
88
  line += "+PRE+" if $DEBUG_MODE
89
89
  when :mw_quote
90
- next if config[:quote_off]
90
+ # next if !config[:quote]
91
91
  line = format_wiki(e.last)
92
92
  line += "+QUOTE+" if $DEBUG_MODE
93
93
  when :mw_unordered, :mw_ordered, :mw_definition
94
- next if config[:list_off]
94
+ next if !config[:list]
95
95
  line = format_wiki(e.last)
96
96
  line += "+LIST+" if $DEBUG_MODE
97
97
  when :mw_redirect
98
- next if config[:redirect_off]
98
+ next if !config[:redirect]
99
99
  line = format_wiki(e.last)
100
100
  line += "+REDIRECT+" if $DEBUG_MODE
101
101
  line += "\n\n"
@@ -108,14 +108,14 @@ wpconv.extract_text do |article|
108
108
  end
109
109
  end
110
110
  contents += line
111
- contents = remove_templates(contents) if config[:template_off]
111
+ contents = remove_templates(contents) unless config[:template]
112
112
  end
113
113
 
114
114
  ##### cleanup #####
115
115
  if /\A\s*\z/m =~ contents
116
116
  result = ""
117
117
  else
118
- result = config[:title_off] ? contents : title + "\n" + contents
118
+ result = config[:title] ? title + "\n" + contents : contents
119
119
  end
120
120
  result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
121
121
  result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.5.4"
2
+ VERSION = "0.6.0"
3
3
  end
data/lib/wp2txt.rb CHANGED
@@ -25,16 +25,16 @@ module Wp2txt
25
25
 
26
26
  include Wp2txt
27
27
 
28
- # attr_accessor :pause_flag, :stop_flag, :outfiles, :convert_off
28
+ # attr_accessor :pause_flag, :stop_flag, :outfiles, :convert
29
29
 
30
- def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert_off = false, strip_tmarker = false)
30
+ def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
31
31
  @parent = parent
32
32
  @fp = nil
33
33
 
34
34
  @input_file = input_file
35
35
  @output_dir = output_dir
36
36
  @tfile_size = tfile_size
37
- @convert_off = convert_off
37
+ @convert = convert
38
38
  @strip_tmarker = strip_tmarker
39
39
  end
40
40
 
@@ -213,16 +213,15 @@ module Wp2txt
213
213
  # call this method to do the job
214
214
  def extract_text(&block)
215
215
  prepare
216
- # output the original xml only split to files of the specified size
217
- if @convert_off
218
- extract
219
- # convert xml to plain text
220
- else
216
+ if @convert
221
217
  if block
222
218
  extract_and_convert(&block)
223
219
  else
224
220
  extract_and_convert
225
221
  end
222
+ else
223
+ # output the original xml only split to files of the specified size
224
+ extract
226
225
  end
227
226
  end
228
227
 
data/spec/utils_spec.rb CHANGED
@@ -23,7 +23,7 @@ describe "Wp2txt" do
23
23
  str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
24
24
  "<<" + content + ">>"
25
25
  end
26
- str_processed.should == str_after
26
+ expect(str_processed).to eq str_after
27
27
 
28
28
  str_before = "#* {{quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
29
29
  |passage={{...}} every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.}}"
@@ -33,7 +33,9 @@ describe "Wp2txt" do
33
33
  str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
34
34
  "<<" + content + ">>"
35
35
  end
36
- str_processed.should == str_after
36
+ #str_processed.should == str_after
37
+ expect(str_processed).to eq str_after
38
+
37
39
  end
38
40
  end
39
41
 
@@ -41,7 +43,7 @@ describe "Wp2txt" do
41
43
  it "replaces character references with real characters" do
42
44
  str_before = "&nbsp; &lt; &gt; &amp; &quot;"
43
45
  str_after = " < > & \""
44
- special_chr(str_before).should == str_after
46
+ expect(special_chr(str_before)).to eq str_after
45
47
  end
46
48
  end
47
49
 
@@ -49,7 +51,7 @@ describe "Wp2txt" do
49
51
  it "replaces character references with real characters" do
50
52
  str_before = "&#x266A;"
51
53
  str_after = "♪"
52
- chrref_to_utf(str_before).should == str_after
54
+ expect(chrref_to_utf(str_before)).to eq str_after
53
55
  end
54
56
  end
55
57
 
@@ -57,7 +59,7 @@ describe "Wp2txt" do
57
59
  it "replaces {mdash}, {ndash}, or {–} with '–'" do
58
60
  str_before = "{mdash} {ndash} {–}"
59
61
  str_after = "– – –"
60
- mndash(str_before).should == str_after
62
+ expect(mndash(str_before)).to eq str_after
61
63
  end
62
64
  end
63
65
 
@@ -65,7 +67,7 @@ describe "Wp2txt" do
65
67
  it "replaces \\r\\n and <br /> inside [ref] ... [/ref] to ' '" do
66
68
  str_before = "[ref]...\r\n...<br />...[/ref]"
67
69
  str_after = "... ... ..."
68
- format_ref(str_before).should == str_after
70
+ expect(format_ref(str_before)).to eq str_after
69
71
  end
70
72
  end
71
73
 
@@ -73,7 +75,7 @@ describe "Wp2txt" do
73
75
  it "replaces <ref> tag with [ref]" do
74
76
  str_before = "<ref> ... <br /> ... </ref> \n <ref />"
75
77
  str_after = "[ref] ... \n ... [/ref] \n "
76
- make_reference(str_before).should == str_after
78
+ expect(make_reference(str_before)).to eq str_after
77
79
  end
78
80
  end
79
81
 
@@ -81,7 +83,7 @@ describe "Wp2txt" do
81
83
  it "removes table formated parts" do
82
84
  str_before = "{| ... \n{| ... \n ...|}\n ...|}"
83
85
  str_after = ""
84
- remove_table(str_before).should == str_after
86
+ expect(remove_table(str_before)).to eq str_after
85
87
  end
86
88
  end
87
89
 
@@ -89,7 +91,7 @@ describe "Wp2txt" do
89
91
  it "removes clade formated parts" do
90
92
  str_before = "\{\{clade ... \n ... \n ... \n\}\}"
91
93
  str_after = ""
92
- remove_clade(str_before).should == str_after
94
+ expect(remove_clade(str_before)).to eq str_after
93
95
  end
94
96
  end
95
97
 
@@ -97,7 +99,7 @@ describe "Wp2txt" do
97
99
  it "removes horizontal lines" do
98
100
  str_before = "\n----\n--\n--\n"
99
101
  str_after = "\n\n"
100
- remove_hr(str_before).should == str_after
102
+ expect(remove_hr(str_before)).to eq str_after
101
103
  end
102
104
  end
103
105
 
@@ -105,10 +107,10 @@ describe "Wp2txt" do
105
107
  it "removes tags" do
106
108
  str_before = "<tag>abc</tag>"
107
109
  str_after = "abc"
108
- remove_tag(str_before).should == str_after
110
+ expect(remove_tag(str_before)).to eq str_after
109
111
  str_before = "[tag]def[/tag]"
110
112
  str_after = "def"
111
- remove_tag(str_before, ['[', ']']).should == str_after
113
+ expect(remove_tag(str_before, ['[', ']'])).to eq str_after
112
114
  end
113
115
  end
114
116
 
@@ -116,7 +118,7 @@ describe "Wp2txt" do
116
118
  it "removes directive" do
117
119
  str_before = "__abc__\n __def__"
118
120
  str_after = "\n "
119
- remove_directive(str_before).should == str_after
121
+ expect(remove_directive(str_before)).to eq str_after
120
122
  end
121
123
  end
122
124
 
@@ -124,7 +126,7 @@ describe "Wp2txt" do
124
126
  it "removes directive" do
125
127
  str_before = "''abc''\n'''def'''"
126
128
  str_after = "abc\ndef"
127
- remove_emphasis(str_before).should == str_after
129
+ expect(remove_emphasis(str_before)).to eq str_after
128
130
  end
129
131
  end
130
132
 
@@ -132,7 +134,7 @@ describe "Wp2txt" do
132
134
  it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
133
135
  str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
134
136
  str_after = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
135
- escape_nowiki(str_before).should =~ str_after
137
+ expect(escape_nowiki(str_before)).to match str_after
136
138
  end
137
139
  end
138
140
 
@@ -141,24 +143,24 @@ describe "Wp2txt" do
141
143
  @nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
142
144
  str_before = "<nowiki-123>def<nowiki-124>"
143
145
  str_after = "[[abc]]def[[ghi]]"
144
- unescape_nowiki(str_before).should == str_after
146
+ expect(unescape_nowiki(str_before)).to eq str_after
145
147
  end
146
148
  end
147
149
 
148
150
  describe "process_interwiki_links" do
149
151
  it "formats text link and remove brackets" do
150
- process_interwiki_links("[[a b]]").should == "a b"
151
- process_interwiki_links("[[a b|c]]").should == "c"
152
- process_interwiki_links("[[a|b|c]]").should == "b|c"
153
- process_interwiki_links("[[硬口蓋鼻音|[ɲ], /J/]]").should == "[ɲ], /J/"
152
+ expect(process_interwiki_links("[[a b]]")).to eq "a b"
153
+ expect(process_interwiki_links("[[a b|c]]")).to eq "c"
154
+ expect(process_interwiki_links("[[a|b|c]]")).to eq "b|c"
155
+ expect(process_interwiki_links("[[硬口蓋鼻音|[ɲ], /J/]]")).to eq "[ɲ], /J/"
154
156
  end
155
157
  end
156
158
 
157
159
  describe "process_external_links" do
158
160
  it "formats text link and remove brackets" do
159
- process_external_links("[http://yohasebe.com yohasebe.com]").should == "yohasebe.com"
160
- process_external_links("[http://yohasebe.com]").should == "http://yohasebe.com"
161
- process_external_links("* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}").should == "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
161
+ expect(process_external_links("[http://yohasebe.com yohasebe.com]")).to eq "yohasebe.com"
162
+ expect(process_external_links("[http://yohasebe.com]")).to eq "http://yohasebe.com"
163
+ expect(process_external_links("* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}")).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
162
164
  end
163
165
  end
164
166
 
@@ -166,30 +168,30 @@ describe "Wp2txt" do
166
168
  it "removes brackets and leaving some text" do
167
169
  str_before = "{{}}"
168
170
  str_after = ""
169
- process_template(str_before).should == str_after
171
+ expect(process_template(str_before)).to eq str_after
170
172
  str_before = "{{lang|en|Japan}}"
171
173
  str_after = "Japan"
172
- process_template(str_before).should == str_after
174
+ expect(process_template(str_before)).to eq str_after
173
175
  str_before = "{{a|b=c|d=f}}"
174
176
  str_after = "a"
175
- process_template(str_before).should == str_after
177
+ expect(process_template(str_before)).to eq str_after
176
178
  str_before = "{{a|b|{{c|d|e}}}}"
177
179
  str_after = "e"
178
- process_template(str_before).should == str_after
180
+ expect(process_template(str_before)).to eq str_after
179
181
  end
180
182
  end
181
183
 
182
- describe "expand_template" do
183
- it "gets data corresponding to a given template using mediawiki api" do
184
- uri = "http://en.wiktionary.org/w/api.php"
185
- template = "{{en-verb}}"
186
- word = "kick"
187
- expanded = expand_template(uri, template, word)
188
- html =<<EOD
189
- <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
190
- EOD
191
- html.strip!
192
- expanded.should == html
193
- end
194
- end
184
+ # describe "expand_template" do
185
+ # it "gets data corresponding to a given template using mediawiki api" do
186
+ # uri = "http://en.wiktionary.org/w/api.php"
187
+ # template = "{{en-verb}}"
188
+ # word = "kick"
189
+ # expanded = expand_template(uri, template, word)
190
+ # html =<<EOD
191
+ # <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
192
+ # EOD
193
+ # html.strip!
194
+ # expanded.should == html
195
+ # end
196
+ # end
195
197
  end
data/wp2txt.gemspec CHANGED
@@ -24,10 +24,10 @@ Gem::Specification.new do |s|
24
24
 
25
25
  s.add_dependency "nokogiri"
26
26
  s.add_dependency "sanitize"
27
- if RUBY_VERSION >= '2.0'
28
- s.add_dependency "bzip2-ruby-rb20"
29
- else
30
- s.add_dependency "bzip2-ruby"
31
- end
27
+ # if RUBY_VERSION >= '2.0'
28
+ # s.add_dependency "bzip2-ruby-rb20"
29
+ # else
30
+ # s.add_dependency "bzip2-ruby"
31
+ # end
32
32
  s.add_dependency "trollop"
33
33
  end
metadata CHANGED
@@ -1,111 +1,97 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.4
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-11 00:00:00.000000000 Z
11
+ date: 2014-10-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - '>='
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - '>='
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rspec
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rake
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - '>='
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - '>='
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: nokogiri
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '>='
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - '>='
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: sanitize
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - '>='
73
+ - - ">="
74
74
  - !ruby/object:Gem::Version
75
75
  version: '0'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - '>='
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
- - !ruby/object:Gem::Dependency
84
- name: bzip2-ruby-rb20
85
- requirement: !ruby/object:Gem::Requirement
86
- requirements:
87
- - - '>='
88
- - !ruby/object:Gem::Version
89
- version: '0'
90
- type: :runtime
91
- prerelease: false
92
- version_requirements: !ruby/object:Gem::Requirement
93
- requirements:
94
- - - '>='
80
+ - - ">="
95
81
  - !ruby/object:Gem::Version
96
82
  version: '0'
97
83
  - !ruby/object:Gem::Dependency
98
84
  name: trollop
99
85
  requirement: !ruby/object:Gem::Requirement
100
86
  requirements:
101
- - - '>='
87
+ - - ">="
102
88
  - !ruby/object:Gem::Version
103
89
  version: '0'
104
90
  type: :runtime
105
91
  prerelease: false
106
92
  version_requirements: !ruby/object:Gem::Requirement
107
93
  requirements:
108
- - - '>='
94
+ - - ">="
109
95
  - !ruby/object:Gem::Version
110
96
  version: '0'
111
97
  description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
@@ -117,7 +103,7 @@ executables:
117
103
  extensions: []
118
104
  extra_rdoc_files: []
119
105
  files:
120
- - .gitignore
106
+ - ".gitignore"
121
107
  - Gemfile
122
108
  - LICENSE
123
109
  - README.md
@@ -142,17 +128,17 @@ require_paths:
142
128
  - lib
143
129
  required_ruby_version: !ruby/object:Gem::Requirement
144
130
  requirements:
145
- - - '>='
131
+ - - ">="
146
132
  - !ruby/object:Gem::Version
147
133
  version: '0'
148
134
  required_rubygems_version: !ruby/object:Gem::Requirement
149
135
  requirements:
150
- - - '>='
136
+ - - ">="
151
137
  - !ruby/object:Gem::Version
152
138
  version: '0'
153
139
  requirements: []
154
140
  rubyforge_project: wp2txt
155
- rubygems_version: 2.1.11
141
+ rubygems_version: 2.4.1
156
142
  signing_key:
157
143
  specification_version: 4
158
144
  summary: Wikipedia dump to text converter