wikipedia-client 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/MIT-LICENSE +20 -0
- data/README.textile +81 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/init.rb +1 -0
- data/install.rb +1 -0
- data/lib/wikipedia.rb +37 -0
- data/lib/wikipedia/client.rb +91 -0
- data/lib/wikipedia/configuration.rb +25 -0
- data/lib/wikipedia/page.rb +109 -0
- data/lib/wikipedia/url.rb +14 -0
- data/script/add_sanitization_test +22 -0
- data/spec/fixtures/Edsger_Dijkstra.json +1 -0
- data/spec/fixtures/Edsger_Dijkstra.yaml +184 -0
- data/spec/fixtures/Edsger_Dijkstra_section_0.json +1 -0
- data/spec/fixtures/Edsger_content.txt +1 -0
- data/spec/fixtures/File_Edsger_Wybe_Dijkstra_jpg.json +1 -0
- data/spec/fixtures/sanitization_samples/Ceawlin_of_Wessex-raw.txt +19 -0
- data/spec/fixtures/sanitization_samples/Ceawlin_of_Wessex-sanitized.txt +3 -0
- data/spec/fixtures/sanitization_samples/Edsger_W_Dijkstra-raw.txt +26 -0
- data/spec/fixtures/sanitization_samples/Edsger_W_Dijkstra-sanitized.txt +2 -0
- data/spec/fixtures/sanitization_samples/Flower_video_game-raw.txt +25 -0
- data/spec/fixtures/sanitization_samples/Flower_video_game-sanitized.txt +2 -0
- data/spec/fixtures/sanitization_samples/How_to_Lose_Friends__Alienate_People_film-raw.txt +28 -0
- data/spec/fixtures/sanitization_samples/How_to_Lose_Friends__Alienate_People_film-sanitized.txt +2 -0
- data/spec/fixtures/sanitization_samples/Kirsten_Dunst-raw.txt +16 -0
- data/spec/fixtures/sanitization_samples/Kirsten_Dunst-sanitized.txt +3 -0
- data/spec/fixtures/sanitization_samples/Large_Hadron_Collider-raw.txt +104 -0
- data/spec/fixtures/sanitization_samples/Large_Hadron_Collider-sanitized.txt +4 -0
- data/spec/fixtures/sanitization_samples/Metro_Goldwyn_Mayer-raw.txt +18 -0
- data/spec/fixtures/sanitization_samples/Metro_Goldwyn_Mayer-sanitized.txt +1 -0
- data/spec/fixtures/sanitization_samples/Middle_Ages-raw.txt +10 -0
- data/spec/fixtures/sanitization_samples/Middle_Ages-sanitized.txt +3 -0
- data/spec/fixtures/sanitization_samples/SMS_Elbing-raw.txt +51 -0
- data/spec/fixtures/sanitization_samples/SMS_Elbing-sanitized.txt +1 -0
- data/spec/fixtures/sanitization_samples/Sashimi-raw.txt +16 -0
- data/spec/fixtures/sanitization_samples/Sashimi-sanitized.txt +7 -0
- data/spec/fixtures/sanitization_samples/Superb_Fairywren-raw.txt +35 -0
- data/spec/fixtures/sanitization_samples/Superb_Fairywren-sanitized.txt +3 -0
- data/spec/fixtures/sanitization_samples/Velociraptor-raw.txt +28 -0
- data/spec/fixtures/sanitization_samples/Velociraptor-sanitized.txt +3 -0
- data/spec/lib/client_spec.rb +108 -0
- data/spec/lib/sanitize_spec.rb +14 -0
- data/spec/lib/url_spec.rb +8 -0
- data/spec/lib/wikipedia_spec.rb +20 -0
- data/spec/spec_helper.rb +4 -0
- data/tasks/wikipedia_tasks.rake +4 -0
- data/uninstall.rb +1 -0
- data/wikipedia-client.gemspec +96 -0
- metadata +134 -0
data/MIT-LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2008 Cyril David
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.textile
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
h1. Wikipedia
|
2
|
+
|
3
|
+
Allows you to get Wikipedia content through its API. This uses the
|
4
|
+
alpha API (api.php), not the deprecated query.php API.
|
5
|
+
|
6
|
+
Wikipedia API reference: "http://en.wikipedia.org/w/api.php":http://en.wikipedia.org/w/api.php
|
7
|
+
|
8
|
+
Adopted from: "http://code.google.com/p/wikipedia-client/":http://code.google.com/p/wikipedia-client/
|
9
|
+
|
10
|
+
h2. Examples
|
11
|
+
|
12
|
+
<pre><code>require 'wikipedia'
|
13
|
+
page = Wikipedia.find( 'Getting Things Done' )
|
14
|
+
|
15
|
+
=> #<Wikipedia::Page>
|
16
|
+
|
17
|
+
page.title
|
18
|
+
|
19
|
+
=> 'Getting Things Done'
|
20
|
+
|
21
|
+
page.content
|
22
|
+
|
23
|
+
=> # all the wiki markup appears here...
|
24
|
+
|
25
|
+
page.categories
|
26
|
+
|
27
|
+
=> [..., "Category:Self-help books", ...]
|
28
|
+
|
29
|
+
page.links
|
30
|
+
|
31
|
+
=> [..., "Business", "Cult following", ...]
|
32
|
+
|
33
|
+
page.images
|
34
|
+
|
35
|
+
=> ["File:Getting Things Done.jpg", ...]
|
36
|
+
|
37
|
+
page.image_urls
|
38
|
+
|
39
|
+
=> ["http://upload.wikimedia.org/wikipedia/en/e/e1/Getting_Things_Done.jpg", ...]</code></pre>
|
40
|
+
|
41
|
+
h2. Configuration
|
42
|
+
|
43
|
+
By default, the client is configured like this:
|
44
|
+
|
45
|
+
<pre><code>Wikipedia.Configure {
|
46
|
+
domain 'en.wikipedia.org'
|
47
|
+
path 'w/api.php'
|
48
|
+
}</code></pre>
|
49
|
+
|
50
|
+
h2. Advanced
|
51
|
+
|
52
|
+
See the API spec at "http://en.wikipedia.org/w/api.php":http://en.wikipedia.org/w/api.php
|
53
|
+
|
54
|
+
If you need data that is not already present, you can override
|
55
|
+
parameters.
|
56
|
+
|
57
|
+
For example, to retrieve only the page info:
|
58
|
+
|
59
|
+
<pre><code>page = Wikipedia.find( 'Getting Things Done', :prop => "info" )
|
60
|
+
|
61
|
+
page.title
|
62
|
+
|
63
|
+
=> "Getting Things Done"
|
64
|
+
|
65
|
+
page.raw_data
|
66
|
+
|
67
|
+
=> {"query"=>{"pages"=>{"959928"=>{"pageid"=>959928, "ns"=>0,
|
68
|
+
"title"=>"Getting Things Done", "touched"=>"2010-03-10T00:04:09Z",
|
69
|
+
"lastrevid"=>348481810, "counter"=>0, "length"=>7891}}}}</code></pre>
|
70
|
+
|
71
|
+
h2. Running specs
|
72
|
+
|
73
|
+
If you have rspec >= 1.1.3 installed, just run:
|
74
|
+
|
75
|
+
rake spec
|
76
|
+
|
77
|
+
h2. Thanks!
|
78
|
+
|
79
|
+
Copyright (c) 2008 [Cyril David], released under the MIT license
|
80
|
+
|
81
|
+
Adopted by Ken Pratt (ken@kenpratt.net) in 2010/03
|
data/Rakefile
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging/release tasks come from Jeweler when it is installed; without
# it the remaining tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "wikipedia-client"
    gem.summary = %Q{Ruby client for the Wikipedia API}
    gem.description = %Q{Ruby client for the Wikipedia API}
    gem.email = "christian.hellsten@gmail.com"
    gem.homepage = "http://github.com/christianhellsten/wikipedia-client"
    gem.authors = ["Cyril David", "Ken Pratt"]
    gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# NOTE(review): this task looks for test/**/test_*.rb; the package listing only
# shows a spec/ directory, so it presumably matches nothing - confirm.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end


desc 'Test the wikipedia plugin.'
task :spec do
  spec_path = File.expand_path(File.dirname(__FILE__) + '/spec/**/*.rb')
  system("spec -cfs #{spec_path}")
end

begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is only available when Jeweler defined it above. Depending
# on it unconditionally makes `rake spec` fail with an unknown-task error when
# Jeweler is missing, so only wire it up when the task exists.
task :spec => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :spec

# Prefer rdoc/task (RDoc::Task); fall back to the legacy rake/rdoctask, which
# newer Rake versions no longer ship and which would otherwise make the whole
# Rakefile fail to load.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "wikipedia-client #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
data/init.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# Loads lib/wikipedia, resolved relative to the directory containing this file.
require File.dirname(__FILE__) + '/lib/wikipedia'
|
data/install.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# Prints the README. The package ships README.textile (there is no plain
# README file), so that is the file to read.
puts File.read(File.dirname(__FILE__) + '/README.textile')
|
data/lib/wikipedia.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
Dir[File.dirname(__FILE__) + '/wikipedia/**/*.rb'].each { |f| require f }
|
2
|
+
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
# Module-level convenience API that delegates to one shared Wikipedia::Client.
#
# Examples:
#   page = Wikipedia.find('Rails')
#   => #<Wikipedia::Page:0x123102>
#   page.content
#   => wiki content appears here
#
# which is just a wrapper for doing
#   client = Wikipedia::Client.new
#   client.find('Rails')
module Wikipedia
  # Looks up +page+ through the shared client; +options+ are passed through
  # unchanged to Client#find.
  def self.find( page, options = {} )
    client.find( page, options )
  end

  # Looks up the image page +title+ through the shared client; +options+ are
  # passed through unchanged to Client#find_image.
  def self.find_image( title, options = {} )
    client.find_image( title, options )
  end

  # Evaluates +block+ in the context of the Configuration singleton, so
  # directive calls inside the block run against that instance.
  def self.Configure(&block)
    Configuration.instance.instance_eval(&block)
  end

  # Default configuration, applied when this module is loaded.
  Configure {
    domain 'en.wikipedia.org'
    path 'w/api.php'
  }

  # Returns the lazily created client shared by the helpers above.
  # A bare `private` keyword does not apply to methods defined with
  # `def self.`, so this method is (and always was) publicly callable; it is
  # declared without the misleading keyword.
  def self.client
    @client ||= Wikipedia::Client.new
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'uri'

module Wikipedia
  # Builds request URLs for the configured api.php endpoint, fetches them and
  # wraps the JSON responses in Page objects.
  class Client
    # see http://en.wikipedia.org/w/api.php
    BASE_URL = "http://:domain/:path?action=:action&format=json"

    # URI.encode was removed in Ruby 3.0; the RFC 2396 parser's #escape
    # performs the same escaping.
    URI_ESCAPER = URI::RFC2396_Parser.new

    # When true (the default), #find follows "#REDIRECT [[...]]" pages.
    attr_accessor :follow_redirects

    def initialize
      self.follow_redirects = true
    end

    # Fetches the page for +title+. +options+ are merged into the request
    # parameters. Redirects are followed while +follow_redirects+ is set;
    # following stops if a redirect points at a title already requested, so a
    # circular redirect cannot loop forever (the last page fetched is returned).
    def find( title, options = {} )
      title = title_from( title )
      page = Page.new( request_page( title, options ) )
      requested = [ title ]
      while follow_redirects && page.redirect?
        target = page.redirect_title
        break if requested.include?( target )
        requested << target
        page = Page.new( request_page( target, options ) )
      end
      page
    end

    # Fetches the image page for +title+; +options+ are merged into the
    # request parameters.
    def find_image( title, options = {} )
      title = title_from( title )
      Page.new( request_image( title, options ) )
    end

    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions%7Clinks%7Cimages%7Ccategories&rvprop=content&titles=Flower%20(video%20game)
    def request_page( title, options = {} )
      request( {
        :action => "query",
        :prop => %w{ revisions links images categories },
        :rvprop => "content",
        :titles => title
      }.merge( options ) )
    end

    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url&titles=File:Flower.png
    def request_image( title, options = {} )
      request( {
        :action => "query",
        :prop => "imageinfo",
        :iiprop => "url",
        :titles => title
      }.merge( options ) )
    end

    # Performs the HTTP GET for +options+ and returns the response body.
    def request( options )
      require 'open-uri'
      URI.parse( url_for( options ) ).read( "User-Agent" => "Ruby/#{RUBY_VERSION}" )
    end

    protected

    # Passes +title+ through Url to extract a title; if that raises, the
    # argument is used unchanged.
    def title_from( title )
      Url.new( title ).title
    rescue StandardError
      title
    end

    def configuration_options
      {
        :domain => Configuration[:domain],
        :path => Configuration[:path]
      }
    end

    # Fills the :placeholders of BASE_URL from +options+ (after the configured
    # domain/path) and appends every remaining option as a query parameter.
    def url_for( options )
      url = BASE_URL.dup
      options = configuration_options.merge( options )
      options.each do |key, val|
        value = urlify_value( val )
        placeholder = ":#{key}"
        if url.include?( placeholder )
          # to_s: sub! needs a String, and encode passes non-strings through.
          url.sub! placeholder, value.to_s
        else
          url << "&#{key}=#{value}"
        end
      end
      url
    end

    # Arrays are flattened and joined with '|' before being encoded.
    def urlify_value( val )
      case val
      when Array
        encode( val.flatten.join( '|' ) )
      else
        encode( val )
      end
    end

    # Escapes String values for use in the URL; other values are returned
    # as-is. '&' would end the parameter and '+' is decoded as a space in a
    # query string, so both are escaped on top of the standard escaping.
    def encode( val )
      case val
      when String
        URI_ESCAPER.escape( val ).gsub( '&', '%26' ).gsub( '+', '%2B' )
      else
        val
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
|
3
|
+
module Wikipedia
  # Singleton holding the configuration directives (see `directives` below).
  class Configuration
    include Singleton

    # Defines one combined reader/writer method per directive name:
    # called without arguments it returns the stored value, called with an
    # argument it stores the first argument (and returns it).
    def self.directives(*directives)
      directives.each do |directive|
        ivar = "@#{directive}"
        define_method directive do |*args|
          if args.empty?
            instance_variable_get(ivar)
          else
            instance_variable_set(ivar, args.first)
          end
        end
      end
    end

    # Returns the current value of +directive+ from the singleton instance.
    # public_send is used so only public methods can be reached through this
    # lookup (plain send would also invoke private methods such as `exit`).
    def self.[](directive)
      instance.public_send(directive)
    end

    directives :domain, :path
  end
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
require 'json'

module Wikipedia
  # Wraps the JSON response of a query request and exposes the fields of the
  # first page contained in it.
  class Page
    # +json+ is the raw response body; it is kept and also parsed.
    # NOTE(review): JSON::load is applied to data fetched from a remote
    # server; consider JSON.parse if additions are never needed - confirm.
    def initialize(json)
      @json = json
      @data = JSON::load(json)
    end

    # The first entry under query/pages in the parsed response.
    def page
      @data['query']['pages'].values.first
    end

    # Text of the first revision, or nil when the page has no revisions.
    # The text is read from the '*' key when present, because the revision
    # hash can carry other keys ahead of it; otherwise the first value is used.
    def content
      revisions = page['revisions']
      return unless revisions
      revision = revisions.first
      return unless revision
      revision.key?('*') ? revision['*'] : revision.values.first
    end

    def sanitized_content
      self.class.sanitize(content)
    end

    # Returns the MatchData for a "#REDIRECT [[Target]]" marker in the content
    # (truthy), or nil/false when there is none.
    def redirect?
      content && content.match(/\#REDIRECT\s+\[\[(.*?)\]\]/i)
    end

    # Target title of the redirect, or nil when the page is not a redirect.
    def redirect_title
      matches = redirect?
      matches[1] if matches
    end

    def title
      page['title']
    end

    # Titles of the page's categories, or nil when the key is absent.
    def categories
      page['categories'].map {|c| c['title'] } if page['categories']
    end

    # Titles of the page's links, or nil when the key is absent.
    def links
      page['links'].map {|c| c['title'] } if page['links']
    end

    # Titles of the page's images, or nil when the key is absent.
    def images
      page['images'].map {|c| c['title'] } if page['images']
    end

    # URL from the first imageinfo entry, or nil when the key is absent.
    def image_url
      page['imageinfo'].first['url'] if page['imageinfo']
    end

    # URLs of the page's jpg/jpeg/png/gif files (skipping "LinkFA-star"),
    # resolved with one Wikipedia.find_image call per image. Returns nil when
    # the page has no images key.
    def image_urls
      list = images
      return unless list
      filtered = list.select {|i| i =~ /^file:.+\.(jpg|jpeg|png|gif)$/i && !i.include?("LinkFA-star") }
      filtered.map do |title|
        Wikipedia.find_image( title ).image_url
      end
    end

    def raw_data
      @data
    end

    def json
      @json
    end

    # Strips wiki markup from +s+ and returns the cleaned copy (the argument
    # is not modified). Returns nil when +s+ is nil.
    def self.sanitize( s )
      return unless s
      s = s.dup

      # strip anything inside curly braces! (repeat so nested templates are
      # removed from the innermost outwards)
      while s =~ /\{\{[^\{\}]+?\}\}/
        s.gsub!(/\{\{[^\{\}]+?\}\}/, '')
      end

      # strip info box
      s.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')

      # strip internal links
      s.gsub!(/\[\[([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
      s.gsub!(/\[\[([^\]\|]+?)\]\]/, '\1')

      # strip images and file links
      s.gsub!(/\[\[Image:[^\[\]]+?\]\]/, '')
      s.gsub!(/\[\[File:[^\[\]]+?\]\]/, '')

      # convert bold/italic to html
      s.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
      s.gsub!(/'''(.+?)'''/, '<b>\1</b>')
      s.gsub!(/''(.+?)''/, '<i>\1</i>')

      # misc
      s.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')
      s.gsub!(/<!--[^>]+?-->/, '')
      # NOTE(review): as written this replaces a space with a space; the
      # intended pattern may have been altered in transit - verify upstream.
      s.gsub!(' ', ' ')
      s.strip!

      # create paragraphs
      sections = s.split("\n\n")
      if sections.size > 1
        s = sections.map {|section| "<p>#{section.strip}</p>" }.join("\n")
      end

      s
    end
  end
end
|