wikipedia-client 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/MIT-LICENSE +20 -0
- data/README.textile +81 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/init.rb +1 -0
- data/install.rb +1 -0
- data/lib/wikipedia.rb +37 -0
- data/lib/wikipedia/client.rb +91 -0
- data/lib/wikipedia/configuration.rb +25 -0
- data/lib/wikipedia/page.rb +109 -0
- data/lib/wikipedia/url.rb +14 -0
- data/script/add_sanitization_test +22 -0
- data/spec/fixtures/Edsger_Dijkstra.json +1 -0
- data/spec/fixtures/Edsger_Dijkstra.yaml +184 -0
- data/spec/fixtures/Edsger_Dijkstra_section_0.json +1 -0
- data/spec/fixtures/Edsger_content.txt +1 -0
- data/spec/fixtures/File_Edsger_Wybe_Dijkstra_jpg.json +1 -0
- data/spec/fixtures/sanitization_samples/Ceawlin_of_Wessex-raw.txt +19 -0
- data/spec/fixtures/sanitization_samples/Ceawlin_of_Wessex-sanitized.txt +3 -0
- data/spec/fixtures/sanitization_samples/Edsger_W_Dijkstra-raw.txt +26 -0
- data/spec/fixtures/sanitization_samples/Edsger_W_Dijkstra-sanitized.txt +2 -0
- data/spec/fixtures/sanitization_samples/Flower_video_game-raw.txt +25 -0
- data/spec/fixtures/sanitization_samples/Flower_video_game-sanitized.txt +2 -0
- data/spec/fixtures/sanitization_samples/How_to_Lose_Friends__Alienate_People_film-raw.txt +28 -0
- data/spec/fixtures/sanitization_samples/How_to_Lose_Friends__Alienate_People_film-sanitized.txt +2 -0
- data/spec/fixtures/sanitization_samples/Kirsten_Dunst-raw.txt +16 -0
- data/spec/fixtures/sanitization_samples/Kirsten_Dunst-sanitized.txt +3 -0
- data/spec/fixtures/sanitization_samples/Large_Hadron_Collider-raw.txt +104 -0
- data/spec/fixtures/sanitization_samples/Large_Hadron_Collider-sanitized.txt +4 -0
- data/spec/fixtures/sanitization_samples/Metro_Goldwyn_Mayer-raw.txt +18 -0
- data/spec/fixtures/sanitization_samples/Metro_Goldwyn_Mayer-sanitized.txt +1 -0
- data/spec/fixtures/sanitization_samples/Middle_Ages-raw.txt +10 -0
- data/spec/fixtures/sanitization_samples/Middle_Ages-sanitized.txt +3 -0
- data/spec/fixtures/sanitization_samples/SMS_Elbing-raw.txt +51 -0
- data/spec/fixtures/sanitization_samples/SMS_Elbing-sanitized.txt +1 -0
- data/spec/fixtures/sanitization_samples/Sashimi-raw.txt +16 -0
- data/spec/fixtures/sanitization_samples/Sashimi-sanitized.txt +7 -0
- data/spec/fixtures/sanitization_samples/Superb_Fairywren-raw.txt +35 -0
- data/spec/fixtures/sanitization_samples/Superb_Fairywren-sanitized.txt +3 -0
- data/spec/fixtures/sanitization_samples/Velociraptor-raw.txt +28 -0
- data/spec/fixtures/sanitization_samples/Velociraptor-sanitized.txt +3 -0
- data/spec/lib/client_spec.rb +108 -0
- data/spec/lib/sanitize_spec.rb +14 -0
- data/spec/lib/url_spec.rb +8 -0
- data/spec/lib/wikipedia_spec.rb +20 -0
- data/spec/spec_helper.rb +4 -0
- data/tasks/wikipedia_tasks.rake +4 -0
- data/uninstall.rb +1 -0
- data/wikipedia-client.gemspec +96 -0
- metadata +134 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2008 [name of plugin creator]
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.textile
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
h1. Wikipedia
|
2
|
+
|
3
|
+
Allows you to get Wikipedia content through its API. This uses the
|
4
|
+
alpha API, not the deprecated query.php API.
|
5
|
+
|
6
|
+
Wikipedia API reference: "http://en.wikipedia.org/w/api.php":http://en.wikipedia.org/w/api.php
|
7
|
+
|
8
|
+
Adopted from: "http://code.google.com/p/wikipedia-client/":http://code.google.com/p/wikipedia-client/
|
9
|
+
|
10
|
+
h2. Examples
|
11
|
+
|
12
|
+
<pre><code>require 'wikipedia'
|
13
|
+
page = Wikipedia.find( 'Getting Things Done' )
|
14
|
+
|
15
|
+
=> #<Wikipedia::Page>
|
16
|
+
|
17
|
+
page.title
|
18
|
+
|
19
|
+
=> 'Getting Things Done'
|
20
|
+
|
21
|
+
page.content
|
22
|
+
|
23
|
+
=> # all the wiki markup appears here...
|
24
|
+
|
25
|
+
page.categories
|
26
|
+
|
27
|
+
=> [..., "Category:Self-help books", ...]
|
28
|
+
|
29
|
+
page.links
|
30
|
+
|
31
|
+
=> [..., "Business", "Cult following", ...]
|
32
|
+
|
33
|
+
page.images
|
34
|
+
|
35
|
+
=> ["File:Getting Things Done.jpg", ...]
|
36
|
+
|
37
|
+
page.image_urls
|
38
|
+
|
39
|
+
=> ["http://upload.wikimedia.org/wikipedia/en/e/e1/Getting_Things_Done.jpg", ...]</code></pre>
|
40
|
+
|
41
|
+
h2. Configuration
|
42
|
+
|
43
|
+
This is by default configured like this:
|
44
|
+
|
45
|
+
<pre><code>Wikipedia.Configure {
|
46
|
+
domain 'en.wikipedia.org'
|
47
|
+
path 'w/api.php'
|
48
|
+
}</code></pre>
|
49
|
+
|
50
|
+
h2. Advanced
|
51
|
+
|
52
|
+
See the API spec at "http://en.wikipedia.org/w/api.php":http://en.wikipedia.org/w/api.php
|
53
|
+
|
54
|
+
If you need data that is not already present, you can override
|
55
|
+
parameters.
|
56
|
+
|
57
|
+
For example, to retrieve only the page info:
|
58
|
+
|
59
|
+
<pre><code>page = Wikipedia.find( 'Getting Things Done', :prop => "info" )
|
60
|
+
|
61
|
+
page.title
|
62
|
+
|
63
|
+
=> "Getting Things Done"
|
64
|
+
|
65
|
+
page.raw_data
|
66
|
+
|
67
|
+
=> {"query"=>{"pages"=>{"959928"=>{"pageid"=>959928, "ns"=>0,
|
68
|
+
"title"=>"Getting Things Done", "touched"=>"2010-03-10T00:04:09Z",
|
69
|
+
"lastrevid"=>348481810, "counter"=>0, "length"=>7891}}}}</code></pre>
|
70
|
+
|
71
|
+
h2. Running specs
|
72
|
+
|
73
|
+
If you have rspec >= 1.1.3 installed, just run:
|
74
|
+
|
75
|
+
rake spec
|
76
|
+
|
77
|
+
h2. Thanks!
|
78
|
+
|
79
|
+
Copyright (c) 2008 [Cyril David], released under the MIT license
|
80
|
+
|
81
|
+
Adopted by Ken Pratt (ken@kenpratt.net) in 2010/03
|
data/Rakefile
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# Build, test and documentation tasks for the wikipedia-client gem.
require 'rubygems'
require 'rake'

# Gem packaging tasks. Jeweler is optional: when it cannot be loaded we only
# print a hint so that the remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "wikipedia-client"
    gem.summary = %Q{Ruby client for the Wikipedia API}
    gem.description = %Q{Ruby client for the Wikipedia API}
    gem.email = "christian.hellsten@gmail.com"
    gem.homepage = "http://github.com/christianhellsten/wikipedia-client"
    gem.authors = ["Cyril David", "Ken Pratt"]
    gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

desc 'Test the wikipedia plugin.'
task :spec do
  spec_path = File.expand_path(File.dirname(__FILE__) + '/spec/**/*.rb')
  # `sh` (unlike `system`) fails the task when the spec run exits non-zero,
  # so a red spec suite no longer looks like a successful build.
  sh "spec -cfs #{spec_path}"
end

begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is not defined in this file; it is only expected to
# exist when the Jeweler tasks above were loaded. Depending on it
# unconditionally makes `rake spec` abort with an unknown-task error.
task :spec => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :spec

# Prefer the RDoc-provided task; fall back to the legacy Rake one, which newer
# Rake releases no longer ship.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip the trailing newline of the VERSION file so the title stays on one line
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "wikipedia-client #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
data/init.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/lib/wikipedia'
|
data/install.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# Print the README after installation. The file ships as README.textile, so
# reading a bare 'README' would raise Errno::ENOENT.
puts File.read(File.join(File.dirname(__FILE__), 'README.textile'))
|
data/lib/wikipedia.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
Dir[File.dirname(__FILE__) + '/wikipedia/**/*.rb'].each { |f| require f }
|
2
|
+
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
# Convenience entry points that delegate to one memoized Wikipedia::Client.
module Wikipedia
  # Examples :
  #   page = Wikipedia.find('Rails')
  #   => #<Wikipedia::Page>
  #   page.content
  #   => wiki content appears here
  #
  # basically just a wrapper for doing
  #   client = Wikipedia::Client.new
  #   client.find('Rails')
  #
  def self.find( page, options = {} )
    client.find( page, options )
  end

  # Same as .find, but delegates to Client#find_image.
  def self.find_image( title, options = {} )
    client.find_image( title, options )
  end

  # Evaluates the block in the context of the Configuration singleton, so the
  # directives (domain, path) can be called without a receiver.
  def self.Configure(&block)
    Configuration.instance.instance_eval(&block)
  end

  # default configuration
  Configure {
    domain 'en.wikipedia.org'
    path 'w/api.php'
  }

  # Lazily built client shared by the module-level helpers.
  def self.client
    @client ||= Wikipedia::Client.new
  end
  # A bare `private` does not apply to `def self.` methods; this is what
  # actually hides the helper from outside callers.
  private_class_method :client
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Wikipedia
  # Small HTTP client that builds query URLs for the API and wraps the JSON
  # responses in Page objects.
  class Client
    # see http://en.wikipedia.org/w/api.php
    # :domain, :path and :action are placeholders filled in by #url_for.
    BASE_URL = "http://:domain/:path?action=:action&format=json"

    # When true (the default) #find keeps requesting pages for as long as the
    # returned page is a redirect.
    attr_accessor :follow_redirects

    def initialize
      self.follow_redirects = true
    end

    # Fetches the page for +title+; +options+ override the default request
    # parameters of #request_page. Returns a Page.
    def find( title, options = {} )
      page = Page.new( request_page( title_from( title ), options ) )
      while follow_redirects && page.redirect?
        page = Page.new( request_page( page.redirect_title, options ) )
      end
      page
    end

    # Fetches the image info for +title+ (see #request_image). Returns a Page.
    def find_image( title, options = {} )
      Page.new( request_image( title_from( title ), options ) )
    end

    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions%7Clinks%7Cimages%7Ccategories&rvprop=content&titles=Flower%20(video%20game)
    def request_page( title, options = {} )
      request( {
        :action => "query",
        :prop => %w{ revisions links images categories },
        :rvprop => "content",
        :titles => title
      }.merge( options ) )
    end

    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url&titles=File:Flower.png
    def request_image( title, options = {} )
      request( {
        :action => "query",
        :prop => "imageinfo",
        :iiprop => "url",
        :titles => title
      }.merge( options ) )
    end

    # Performs the HTTP GET for +options+ and returns the response body.
    def request( options )
      require 'open-uri' # loaded lazily, only when a request is actually made
      URI.parse( url_for( options ) ).read( "User-Agent" => "Ruby/#{RUBY_VERSION}" )
    end

    protected

    # Lets Url extract a title from its argument; any StandardError raised
    # while doing so means the argument is used unchanged.
    def title_from( title )
      Url.new( title ).title
    rescue StandardError
      title
    end

    def configuration_options
      {
        :domain => Configuration[:domain],
        :path => Configuration[:path]
      }
    end

    # Builds the request URL: options whose key is a placeholder in BASE_URL
    # are substituted into the template, all others are appended as query
    # parameters in hash order.
    def url_for( options )
      options = configuration_options.merge( options )
      template_opts, query_opts = options.partition { |key, _| BASE_URL.include?( ":#{key}" ) }

      url = BASE_URL.dup
      # Fill the placeholders before any user value is appended, so a value
      # that happens to contain ":key" can never be mistaken for a placeholder.
      # The block form keeps backslashes in the value literal.
      template_opts.each { |key, val| url.sub!( ":#{key}" ) { urlify_value( val ).to_s } }
      query_opts.each { |key, val| url << "&#{key}=#{urlify_value( val )}" }
      url
    end

    # Arrays become a single '|'-separated value; everything is then encoded.
    def urlify_value( val )
      case val
      when Array
        encode( val.flatten.join( '|' ) )
      else
        encode( val )
      end
    end

    # Percent-encodes strings; other values are returned untouched.
    def encode( val )
      return val unless val.is_a?( String )

      # URI.encode no longer exists in Ruby 3; the parser's #escape performs
      # the same RFC 2396 escaping. '&' would split the query string and '+'
      # is decoded as a space in a query string, so both are escaped as well.
      URI::DEFAULT_PARSER.escape( val ).gsub( '&', '%26' ).gsub( '+', '%2B' )
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
|
3
|
+
module Wikipedia
  # Singleton holding the configuration directives (domain and path).
  class Configuration
    include Singleton

    # Defines one combined reader/writer per directive name: called without
    # arguments it returns the stored value, called with an argument it stores
    # that argument.
    def self.directives(*directives)
      directives.each do |directive|
        ivar = "@#{directive}"
        define_method directive do |*args|
          if args.empty?
            instance_variable_get(ivar)
          else
            instance_variable_set(ivar, args.first)
          end
        end
      end
    end

    # Reads the current value of +directive+, e.g. Configuration[:domain].
    # public_send (instead of send) keeps private methods such as #initialize
    # or the Kernel helpers unreachable through this lookup; they raise
    # NoMethodError just like an unknown directive does.
    def self.[](directive)
      instance.public_send(directive)
    end

    directives :domain, :path
  end
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
module Wikipedia
  # Wraps the JSON text of an API "query" response and exposes the first page
  # found in it.
  class Page
    # The JSON text this page was built from.
    attr_reader :json

    # json - the response body as a JSON string.
    def initialize(json)
      require 'json'
      @json = json
      # NOTE(review): JSON.load is meant for trusted input and the data here
      # comes from the network; consider JSON.parse once callers are known to
      # always pass a String.
      @data = JSON::load(json)
    end

    # The first entry of query/pages in the response.
    def page
      @data['query']['pages'].values.first
    end

    # The text of the first revision, or nil when the response carries no
    # revisions.
    def content
      revision = (page['revisions'] || []).first
      return unless revision

      # The API is expected to put the revision text under '*' (see the
      # MediaWiki revisions reference). Taking the first hash value only works
      # while '*' is the sole key, so it is kept merely as a fallback.
      revision.key?('*') ? revision['*'] : revision.values.first
    end

    # #content with wiki markup stripped or converted, see .sanitize.
    def sanitized_content
      self.class.sanitize(content)
    end

    # Returns the MatchData of the "#REDIRECT [[Target]]" marker, or a falsy
    # value when the page is not a redirect.
    def redirect?
      text = content
      text && text.match(/\#REDIRECT\s+\[\[(.*?)\]\]/i)
    end

    # The title this page redirects to, nil when it is not a redirect.
    def redirect_title
      if matches = redirect?
        matches[1]
      end
    end

    def title
      page['title']
    end

    # The three readers below return an array of titles, or nil when the
    # response does not include the corresponding property.
    def categories
      titles_of('categories')
    end

    def links
      titles_of('links')
    end

    def images
      titles_of('images')
    end

    # URL of the first imageinfo entry, nil when there is none.
    def image_url
      page['imageinfo'].first['url'] if page['imageinfo']
    end

    # Resolves the URL of every jpg/jpeg/png/gif file title in #images, skipping
    # titles that contain "LinkFA-star". Performs one Wikipedia.find_image
    # lookup per image.
    def image_urls
      if list = images
        filtered = list.select {|i| i =~ /^file:.+\.(jpg|jpeg|png|gif)$/i && !i.include?("LinkFA-star") }
        filtered.map do |title|
          Wikipedia.find_image( title ).image_url
        end
      end
    end

    # The parsed response.
    def raw_data
      @data
    end

    # Turns wiki markup into lightly formatted HTML. Returns nil for nil.
    # The order of the steps matters: links are unwrapped before image/file
    # tags are removed, because the image pattern cannot match while a nested
    # [[link]] is still inside the tag.
    def self.sanitize( s )
      return unless s

      s = s.dup

      # strip anything inside curly braces! (innermost first, until none is left)
      while s =~ /\{\{[^\{\}]+?\}\}/
        s.gsub!(/\{\{[^\{\}]+?\}\}/, '')
      end

      # strip info box
      s.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')

      # strip internal links
      s.gsub!(/\[\[([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
      s.gsub!(/\[\[([^\]\|]+?)\]\]/, '\1')

      # strip images and file links
      s.gsub!(/\[\[Image:[^\[\]]+?\]\]/, '')
      s.gsub!(/\[\[File:[^\[\]]+?\]\]/, '')

      # convert bold/italic to html
      s.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
      s.gsub!(/'''(.+?)'''/, '<b>\1</b>')
      s.gsub!(/''(.+?)''/, '<i>\1</i>')

      # misc
      s.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')
      s.gsub!(/<!--[^>]+?-->/, '')
      # non-breaking space entities become plain spaces
      s.gsub!('&nbsp;', ' ')
      s.strip!

      # create paragraphs
      sections = s.split("\n\n")
      if sections.size > 1
        s = sections.map {|section| "<p>#{section.strip}</p>" }.join("\n")
      end

      s
    end

    private

    # Titles of the entries listed under +key+, nil when the key is absent.
    def titles_of( key )
      entries = page[key]
      entries.map {|entry| entry['title'] } if entries
    end
  end
end
|