wikipedia-client 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/.gitignore +5 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README.textile +81 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/init.rb +1 -0
  7. data/install.rb +1 -0
  8. data/lib/wikipedia.rb +37 -0
  9. data/lib/wikipedia/client.rb +91 -0
  10. data/lib/wikipedia/configuration.rb +25 -0
  11. data/lib/wikipedia/page.rb +109 -0
  12. data/lib/wikipedia/url.rb +14 -0
  13. data/script/add_sanitization_test +22 -0
  14. data/spec/fixtures/Edsger_Dijkstra.json +1 -0
  15. data/spec/fixtures/Edsger_Dijkstra.yaml +184 -0
  16. data/spec/fixtures/Edsger_Dijkstra_section_0.json +1 -0
  17. data/spec/fixtures/Edsger_content.txt +1 -0
  18. data/spec/fixtures/File_Edsger_Wybe_Dijkstra_jpg.json +1 -0
  19. data/spec/fixtures/sanitization_samples/Ceawlin_of_Wessex-raw.txt +19 -0
  20. data/spec/fixtures/sanitization_samples/Ceawlin_of_Wessex-sanitized.txt +3 -0
  21. data/spec/fixtures/sanitization_samples/Edsger_W_Dijkstra-raw.txt +26 -0
  22. data/spec/fixtures/sanitization_samples/Edsger_W_Dijkstra-sanitized.txt +2 -0
  23. data/spec/fixtures/sanitization_samples/Flower_video_game-raw.txt +25 -0
  24. data/spec/fixtures/sanitization_samples/Flower_video_game-sanitized.txt +2 -0
  25. data/spec/fixtures/sanitization_samples/How_to_Lose_Friends__Alienate_People_film-raw.txt +28 -0
  26. data/spec/fixtures/sanitization_samples/How_to_Lose_Friends__Alienate_People_film-sanitized.txt +2 -0
  27. data/spec/fixtures/sanitization_samples/Kirsten_Dunst-raw.txt +16 -0
  28. data/spec/fixtures/sanitization_samples/Kirsten_Dunst-sanitized.txt +3 -0
  29. data/spec/fixtures/sanitization_samples/Large_Hadron_Collider-raw.txt +104 -0
  30. data/spec/fixtures/sanitization_samples/Large_Hadron_Collider-sanitized.txt +4 -0
  31. data/spec/fixtures/sanitization_samples/Metro_Goldwyn_Mayer-raw.txt +18 -0
  32. data/spec/fixtures/sanitization_samples/Metro_Goldwyn_Mayer-sanitized.txt +1 -0
  33. data/spec/fixtures/sanitization_samples/Middle_Ages-raw.txt +10 -0
  34. data/spec/fixtures/sanitization_samples/Middle_Ages-sanitized.txt +3 -0
  35. data/spec/fixtures/sanitization_samples/SMS_Elbing-raw.txt +51 -0
  36. data/spec/fixtures/sanitization_samples/SMS_Elbing-sanitized.txt +1 -0
  37. data/spec/fixtures/sanitization_samples/Sashimi-raw.txt +16 -0
  38. data/spec/fixtures/sanitization_samples/Sashimi-sanitized.txt +7 -0
  39. data/spec/fixtures/sanitization_samples/Superb_Fairywren-raw.txt +35 -0
  40. data/spec/fixtures/sanitization_samples/Superb_Fairywren-sanitized.txt +3 -0
  41. data/spec/fixtures/sanitization_samples/Velociraptor-raw.txt +28 -0
  42. data/spec/fixtures/sanitization_samples/Velociraptor-sanitized.txt +3 -0
  43. data/spec/lib/client_spec.rb +108 -0
  44. data/spec/lib/sanitize_spec.rb +14 -0
  45. data/spec/lib/url_spec.rb +8 -0
  46. data/spec/lib/wikipedia_spec.rb +20 -0
  47. data/spec/spec_helper.rb +4 -0
  48. data/tasks/wikipedia_tasks.rake +4 -0
  49. data/uninstall.rb +1 -0
  50. data/wikipedia-client.gemspec +96 -0
  51. metadata +134 -0
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.swp
2
+ *.tmp
3
+ *.log
4
+ pkg/*
5
+
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2008 [name of plugin creator]
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,81 @@
1
+ h1. Wikipedia
2
+
3
+ Allows you to fetch Wikipedia content through its API. This uses the
4
+ alpha API (api.php), not the deprecated query.php API.
5
+
6
+ Wikipedia API reference: "http://en.wikipedia.org/w/api.php":http://en.wikipedia.org/w/api.php
7
+
8
+ Adopted from: "http://code.google.com/p/wikipedia-client/":http://code.google.com/p/wikipedia-client/
9
+
10
+ h2. Examples
11
+
12
+ <pre><code>require 'wikipedia'
13
+ page = Wikipedia.find( 'Getting Things Done' )
14
+
15
+ => #<Wikipedia:Page>
16
+
17
+ page.title
18
+
19
+ => 'Getting Things Done'
20
+
21
+ page.content
22
+
23
+ => # all the wiki markup appears here...
24
+
25
+ page.categories
26
+
27
+ => [..., "Category:Self-help books", ...]
28
+
29
+ page.links
30
+
31
+ => [..., "Business", "Cult following", ...]
32
+
33
+ page.images
34
+
35
+ => ["File:Getting Things Done.jpg", ...]
36
+
37
+ page.image_urls
38
+
39
+ => ["http://upload.wikimedia.org/wikipedia/en/e/e1/Getting_Things_Done.jpg", ...]</code></pre>
40
+
41
+ h2. Configuration
42
+
43
+ This is by default configured like this:
44
+
45
+ <pre><code>Wikipedia.Configure {
46
+ domain 'en.wikipedia.org'
47
+ path 'w/api.php'
48
+ }</code></pre>
49
+
50
+ h2. Advanced
51
+
52
+ See the API spec at "http://en.wikipedia.org/w/api.php":http://en.wikipedia.org/w/api.php
53
+
54
+ If you need data that is not already present, you can override
55
+ parameters.
56
+
57
+ For example, to retrieve only the page info:
58
+
59
+ <pre><code>page = Wikipedia.find( 'Getting Things Done', :prop => "info" )
60
+
61
+ page.title
62
+
63
+ => "Getting Things Done"
64
+
65
+ page.raw_data
66
+
67
+ => {"query"=>{"pages"=>{"959928"=>{"pageid"=>959928, "ns"=>0,
68
+ "title"=>"Getting Things Done", "touched"=>"2010-03-10T00:04:09Z",
69
+ "lastrevid"=>348481810, "counter"=>0, "length"=>7891}}}}</code></pre>
70
+
71
+ h2. Running specs
72
+
73
+ If you have rspec >= 1.1.3 installed, just run:
74
+
75
+ rake spec
76
+
77
+ h2. Thanks!
78
+
79
+ Copyright (c) 2008 [Cyril David], released under the MIT license
80
+
81
+ Adopted by Ken Pratt (ken@kenpratt.net) in 2010/03
data/Rakefile ADDED
@@ -0,0 +1,60 @@
1
# Rakefile for the wikipedia-client gem.
#
# Tasks declared here:
#   * Jeweler packaging tasks (only when the jeweler gem can be loaded)
#   * :test - Rake::TestTask over test/**/test_*.rb
#   * :spec - runs the files under spec/ with the `spec` command (default task)
#   * :rcov - coverage run, or an abort message when rcov is not installed
#   * RDoc generation into ./rdoc
require 'rubygems'
require 'rake'

begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "wikipedia-client"
    gem.summary = %Q{Ruby client for the Wikipedia API}
    gem.description = %Q{Ruby client for the Wikipedia API}
    gem.email = "christian.hellsten@gmail.com"
    gem.homepage = "http://github.com/christianhellsten/wikipedia-client"
    gem.authors = ["Cyril David", "Ken Pratt"]
    gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end


desc 'Test the wikipedia plugin.'
task :spec do
  spec_path = File.expand_path(File.dirname(__FILE__) + '/spec/**/*.rb')
  system("spec -cfs #{spec_path}")
end

begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# Only depend on :check_dependencies when some loaded library actually defined
# it; otherwise `rake spec` would abort with an unknown-task error (for example
# when the jeweler require above failed).
task :spec => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :spec

# Prefer the RDoc-provided task; fall back to the legacy rake/rdoctask file on
# installations that still ship it.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip the trailing newline of the VERSION file so the title stays on one line
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "wikipedia-client #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
data/init.rb ADDED
@@ -0,0 +1 @@
1
+ require File.dirname(__FILE__) + '/lib/wikipedia'
data/install.rb ADDED
@@ -0,0 +1 @@
1
# Print the README on install. The package ships README.textile (there is no
# plain README file), so read that file.
puts File.read(File.dirname(__FILE__) + '/README.textile')
data/lib/wikipedia.rb ADDED
@@ -0,0 +1,37 @@
1
+ Dir[File.dirname(__FILE__) + '/wikipedia/**/*.rb'].each { |f| require f }
2
+
3
+ require 'uri'
4
+
5
# Top-level convenience API. The module-level methods delegate to a single
# memoized Wikipedia::Client instance.
#
# Examples :
#   page = Wikipedia.find('Rails')
#   => #<Wikipedia:0x123102>
#   page.content
#   => wiki content appears here
#
# basically just a wrapper for doing
#   client = Wikipedia::Client.new
#   client.find('Rails')
module Wikipedia
  # Looks up +page+ through the shared client; +options+ are passed through
  # unchanged to Client#find.
  def self.find( page, options = {} )
    client.find( page, options )
  end

  # Looks up the image/file page +title+ through the shared client; +options+
  # are passed through unchanged to Client#find_image.
  def self.find_image( title, options = {} )
    client.find_image( title, options )
  end

  # Evaluates +block+ in the context of the Configuration singleton, so the
  # directives defined there can be called without a receiver.
  def self.Configure(&block)
    Configuration.instance.instance_eval(&block)
  end

  # Defaults applied when this file is loaded.
  Configure {
    domain 'en.wikipedia.org'
    path 'w/api.php'
  }

  # Shared, lazily created client used by the methods above.
  #
  # The original source had a bare `private` keyword in front of this method.
  # `private` does not affect methods defined with `def self.`, so this method
  # has always been publicly callable; the keyword was dropped to stop it from
  # suggesting otherwise.
  def self.client
    @client ||= Wikipedia::Client.new
  end
end
@@ -0,0 +1,91 @@
1
module Wikipedia
  # HTTP client for the MediaWiki api.php endpoint.
  # see http://en.wikipedia.org/w/api.php
  class Client
    # URL template. The :domain, :path and :action placeholders are filled in
    # by #url_for; every other option is appended as a query parameter.
    BASE_URL = "http://:domain/:path?action=:action&format=json"

    # When true (the default), #find keeps requesting the target of
    # "#REDIRECT [[...]]" pages until it reaches a non-redirect page.
    attr_accessor :follow_redirects

    def initialize
      self.follow_redirects = true
    end

    # Fetches +title+ and wraps the response in a Page.
    #
    # +title+ is first run through Url to extract a title; if that raises,
    # +title+ is used as given. +options+ override the default query
    # parameters (see #request_page).
    #
    # Redirects are followed while +follow_redirects+ is set. Following stops
    # as soon as a redirect points at a title that was already requested, so
    # a circular redirect chain cannot loop forever; the last page fetched is
    # returned in that case.
    def find( title, options = {} )
      title = Url.new(title).title rescue title
      page = Page.new( request_page( title, options ) )
      visited = [ title ]
      while follow_redirects and page.redirect?
        target = page.redirect_title
        break if visited.include?( target )
        visited << target
        page = Page.new( request_page( target, options ) )
      end
      page
    end

    # Fetches the image info for +title+ and wraps the response in a Page.
    # +title+ gets the same Url treatment as in #find.
    def find_image( title, options = {} )
      title = Url.new(title).title rescue title
      Page.new( request_image( title, options ) )
    end

    # Requests revisions (content), links, images and categories for +title+.
    # Entries in +options+ replace or extend the default parameters.
    #
    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions%7Clinks%7Cimages%7Ccategories&rvprop=content&titles=Flower%20(video%20game)
    def request_page( title, options = {} )
      request( {
        :action => "query",
        :prop => %w{ revisions links images categories },
        :rvprop => "content",
        :titles => title
      }.merge( options ) )
    end

    # Requests the image URL info for +title+.
    # Entries in +options+ replace or extend the default parameters.
    #
    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url&titles=File:Flower.png
    def request_image( title, options = {} )
      request( {
        :action => "query",
        :prop => "imageinfo",
        :iiprop => "url",
        :titles => title
      }.merge( options ) )
    end

    # Builds the URL for +options+ and returns the response body read via
    # open-uri, sending a "Ruby/<version>" User-Agent header.
    def request( options )
      require 'open-uri'
      URI.parse( url_for( options ) ).read( "User-Agent" => "Ruby/#{RUBY_VERSION}" )
    end

    protected
      # Domain and path taken from the Configuration singleton.
      def configuration_options
        {
          :domain => Configuration[:domain],
          :path => Configuration[:path]
        }
      end

      # Expands BASE_URL with +options+ (merged over the configured domain
      # and path). A key that appears as a ":key" placeholder in the URL
      # replaces that placeholder; any other key is appended as "&key=value".
      def url_for( options )
        url = BASE_URL.dup
        options = configuration_options.merge( options )
        options.each do |key, val|
          # to_s: sub! only accepts Strings, but callers may pass symbols or
          # numbers as option values
          value = urlify_value( val ).to_s
          placeholder = ":#{key}"
          if url.include?( placeholder )
            # block form inserts the value literally (no backreference
            # interpretation of the replacement text)
            url.sub!( placeholder ) { value }
          else
            url << "&#{key}=#{value}"
          end
        end
        url
      end

      # Arrays are flattened and joined with '|' before encoding; everything
      # else is encoded as is.
      def urlify_value( val )
        case val
        when Array
          encode( val.flatten.join( '|' ) )
        else
          encode( val )
        end
      end

      # Percent-escapes Strings (additionally escaping '&', which the generic
      # URI escaping leaves alone). Non-String values are returned unchanged.
      def encode( val )
        case val
        when String
          escape_uri( val ).gsub( '&', '%26' )
        else
          val
        end
      end

      # Generic URI escaping with the semantics of the historical URI.encode.
      # URI.encode itself is gone from current Ruby versions, so use the
      # parser object that implements the same escaping when it is available
      # and only fall back to URI.encode on Rubies that predate the parsers.
      def escape_uri( str )
        require 'uri'
        if defined?( URI::RFC2396_Parser )
          ( @uri_escaper ||= URI::RFC2396_Parser.new ).escape( str )
        elsif defined?( URI::Parser )
          ( @uri_escaper ||= URI::Parser.new ).escape( str )
        else
          URI.encode( str )
        end
      end
  end
end
@@ -0,0 +1,25 @@
1
+ require 'singleton'
2
+
3
module Wikipedia
  # Process-wide settings holder (Singleton). Each directive is a combined
  # reader/writer: called without arguments it returns the stored value,
  # called with an argument it stores that argument.
  class Configuration
    include Singleton

    class << self
      # Defines one reader/writer instance method per name in +directives+.
      def directives(*directives)
        directives.each do |directive|
          ivar = "@#{directive}"
          define_method(directive) do |*args|
            # no argument => read; otherwise store the first argument
            args.empty? ? instance_variable_get(ivar) : instance_variable_set(ivar, args.first)
          end
        end
      end

      # Reads +directive+ from the singleton instance,
      # e.g. Configuration[:domain].
      def [](directive)
        instance.send(directive)
      end
    end

    directives :domain, :path
  end
end
@@ -0,0 +1,109 @@
1
module Wikipedia
  # Wraps one JSON response of an api.php "query" request and exposes the
  # first page contained in it.
  class Page
    # +json+ is the raw response body (a String, or an object responding to
    # #read). A nil or blank body leaves the parsed data nil.
    def initialize(json)
      require 'json'
      @json = json
      source = json.respond_to?(:read) ? json.read : json
      text = source.to_s
      # The body comes from the network, so use JSON.parse rather than
      # JSON.load (which is meant for trusted input only). The two options
      # keep the permissive nesting/NaN handling JSON.load applied.
      @data = text.strip.empty? ? nil : JSON.parse(text, :max_nesting => false, :allow_nan => true)
    end

    # The first entry of query -> pages in the response.
    def page
      @data['query']['pages'].values.first
    end

    # Wiki markup of the first revision, or nil when the response carries no
    # revisions. The text is read from the '*' key when present; revisions
    # can carry other keys in front of it, so "first value" is only used as a
    # fallback.
    def content
      revisions = page['revisions']
      return nil unless revisions

      revision = revisions.first
      revision.key?('*') ? revision['*'] : revision.values.first
    end

    # #content run through Page.sanitize.
    def sanitized_content
      self.class.sanitize(content)
    end

    # MatchData when the content contains a "#REDIRECT [[Target]]" marker
    # (case-insensitive); nil or false otherwise.
    def redirect?
      content && content.match(/\#REDIRECT\s+\[\[(.*?)\]\]/i)
    end

    # Target title of the redirect, or nil when this is not a redirect page.
    def redirect_title
      if matches = redirect?
        matches[1]
      end
    end

    def title
      page['title']
    end

    # Titles of the page's categories, or nil when none were returned.
    def categories
      page['categories'].map {|c| c['title'] } if page['categories']
    end

    # Titles of the pages linked from this page, or nil when none were returned.
    def links
      page['links'].map {|c| c['title'] } if page['links']
    end

    # Titles of the images used on this page, or nil when none were returned.
    def images
      page['images'].map {|c| c['title'] } if page['images']
    end

    # URL of the first imageinfo entry (present on responses produced by an
    # imageinfo query), or nil.
    def image_url
      page['imageinfo'].first['url'] if page['imageinfo']
    end

    # URLs for the jpg/jpeg/png/gif "File:" images of this page, skipping
    # titles that contain "LinkFA-star". Calls Wikipedia.find_image once per
    # remaining title. Returns nil when the page lists no images.
    def image_urls
      if list = images
        filtered = list.select {|i| i =~ /^file:.+\.(jpg|jpeg|png|gif)$/i && !i.include?("LinkFA-star") }
        filtered.map do |title|
          Wikipedia.find_image( title ).image_url
        end
      end
    end

    # The parsed response.
    def raw_data
      @data
    end

    # The response exactly as it was passed to the constructor.
    def json
      @json
    end

    # Reduces wiki markup +s+ to lightly formatted HTML text. Returns nil for
    # nil input; +s+ itself is never modified.
    #
    # The substitutions run in the order written below.
    def self.sanitize( s )
      if s
        s = s.dup

        # strip anything inside curly braces!
        # (innermost pairs first; repeat until nothing is left so nested
        # templates disappear completely)
        while s =~ /\{\{[^\{\}]+?\}\}/
          s.gsub!(/\{\{[^\{\}]+?\}\}/, '')
        end

        # strip info box
        s.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')

        # strip internal links
        s.gsub!(/\[\[([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
        s.gsub!(/\[\[([^\]\|]+?)\]\]/, '\1')

        # strip images and file links
        s.gsub!(/\[\[Image:[^\[\]]+?\]\]/, '')
        s.gsub!(/\[\[File:[^\[\]]+?\]\]/, '')

        # convert bold/italic to html
        s.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
        s.gsub!(/'''(.+?)'''/, '<b>\1</b>')
        s.gsub!(/''(.+?)''/, '<i>\1</i>')

        # misc
        s.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')
        s.gsub!(/<!--[^>]+?-->/, '')
        # non-breaking-space entities become plain spaces
        s.gsub!('&nbsp;', ' ')
        s.strip!

        # create paragraphs
        sections = s.split("\n\n")
        if sections.size > 1
          s = sections.map {|section| "<p>#{section.strip}</p>" }.join("\n")
        end

        s
      end
    end
  end
end