wp2txt 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +20 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +65 -0
- data/Rakefile +9 -0
- data/bin/wp2txt +112 -0
- data/data/testdata.bz2 +0 -0
- data/lib/wp2txt.rb +323 -0
- data/lib/wp2txt/article.rb +177 -0
- data/lib/wp2txt/mw_api.rb +65 -0
- data/lib/wp2txt/progressbar.rb +305 -0
- data/lib/wp2txt/utils.rb +430 -0
- data/lib/wp2txt/version.rb +3 -0
- data/spec/spec_helper.rb +6 -0
- data/spec/utils_spec.rb +195 -0
- data/wp2txt.gemspec +26 -0
- metadata +145 -0
data/wp2txt.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "wp2txt/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "wp2txt"
|
7
|
+
s.version = Wp2txt::VERSION
|
8
|
+
s.authors = ["Yoichiro Hasebe"]
|
9
|
+
s.email = ["yohasebe@gmail.com"]
|
10
|
+
s.homepage = "http://github.com/yohasebe/wp2txt"
|
11
|
+
s.summary = %q{Wikipedia dump to text converter}
|
12
|
+
s.description = %q{WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.}
|
13
|
+
|
14
|
+
s.rubyforge_project = "wp2txt"
|
15
|
+
|
16
|
+
s.files = `git ls-files`.split("\n")
|
17
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
18
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
19
|
+
s.require_paths = ["lib"]
|
20
|
+
|
21
|
+
s.add_development_dependency "rspec"
|
22
|
+
s.add_runtime_dependency "sanitize"
|
23
|
+
s.add_runtime_dependency "bzip2-ruby"
|
24
|
+
s.add_runtime_dependency "trollop"
|
25
|
+
s.add_runtime_dependency "nokogiri"
|
26
|
+
end
|
metadata
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wp2txt
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Yoichiro Hasebe
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-10-22 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: sanitize
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: bzip2-ruby
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: trollop
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: nokogiri
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
|
95
|
+
XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
96
|
+
email:
|
97
|
+
- yohasebe@gmail.com
|
98
|
+
executables:
|
99
|
+
- wp2txt
|
100
|
+
extensions: []
|
101
|
+
extra_rdoc_files: []
|
102
|
+
files:
|
103
|
+
- .gitignore
|
104
|
+
- Gemfile
|
105
|
+
- LICENSE
|
106
|
+
- README.md
|
107
|
+
- Rakefile
|
108
|
+
- bin/wp2txt
|
109
|
+
- data/testdata.bz2
|
110
|
+
- lib/wp2txt.rb
|
111
|
+
- lib/wp2txt/article.rb
|
112
|
+
- lib/wp2txt/mw_api.rb
|
113
|
+
- lib/wp2txt/progressbar.rb
|
114
|
+
- lib/wp2txt/utils.rb
|
115
|
+
- lib/wp2txt/version.rb
|
116
|
+
- spec/spec_helper.rb
|
117
|
+
- spec/utils_spec.rb
|
118
|
+
- wp2txt.gemspec
|
119
|
+
homepage: http://github.com/yohasebe/wp2txt
|
120
|
+
licenses: []
|
121
|
+
post_install_message:
|
122
|
+
rdoc_options: []
|
123
|
+
require_paths:
|
124
|
+
- lib
|
125
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
126
|
+
none: false
|
127
|
+
requirements:
|
128
|
+
- - ! '>='
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: '0'
|
131
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
|
+
none: false
|
133
|
+
requirements:
|
134
|
+
- - ! '>='
|
135
|
+
- !ruby/object:Gem::Version
|
136
|
+
version: '0'
|
137
|
+
requirements: []
|
138
|
+
rubyforge_project: wp2txt
|
139
|
+
rubygems_version: 1.8.24
|
140
|
+
signing_key:
|
141
|
+
specification_version: 3
|
142
|
+
summary: Wikipedia dump to text converter
|
143
|
+
test_files:
|
144
|
+
- spec/spec_helper.rb
|
145
|
+
- spec/utils_spec.rb
|