wikipedia-client 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/.gitignore +5 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README.textile +81 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/init.rb +1 -0
  7. data/install.rb +1 -0
  8. data/lib/wikipedia.rb +37 -0
  9. data/lib/wikipedia/client.rb +91 -0
  10. data/lib/wikipedia/configuration.rb +25 -0
  11. data/lib/wikipedia/page.rb +109 -0
  12. data/lib/wikipedia/url.rb +14 -0
  13. data/script/add_sanitization_test +22 -0
  14. data/spec/fixtures/Edsger_Dijkstra.json +1 -0
  15. data/spec/fixtures/Edsger_Dijkstra.yaml +184 -0
  16. data/spec/fixtures/Edsger_Dijkstra_section_0.json +1 -0
  17. data/spec/fixtures/Edsger_content.txt +1 -0
  18. data/spec/fixtures/File_Edsger_Wybe_Dijkstra_jpg.json +1 -0
  19. data/spec/fixtures/sanitization_samples/Ceawlin_of_Wessex-raw.txt +19 -0
  20. data/spec/fixtures/sanitization_samples/Ceawlin_of_Wessex-sanitized.txt +3 -0
  21. data/spec/fixtures/sanitization_samples/Edsger_W_Dijkstra-raw.txt +26 -0
  22. data/spec/fixtures/sanitization_samples/Edsger_W_Dijkstra-sanitized.txt +2 -0
  23. data/spec/fixtures/sanitization_samples/Flower_video_game-raw.txt +25 -0
  24. data/spec/fixtures/sanitization_samples/Flower_video_game-sanitized.txt +2 -0
  25. data/spec/fixtures/sanitization_samples/How_to_Lose_Friends__Alienate_People_film-raw.txt +28 -0
  26. data/spec/fixtures/sanitization_samples/How_to_Lose_Friends__Alienate_People_film-sanitized.txt +2 -0
  27. data/spec/fixtures/sanitization_samples/Kirsten_Dunst-raw.txt +16 -0
  28. data/spec/fixtures/sanitization_samples/Kirsten_Dunst-sanitized.txt +3 -0
  29. data/spec/fixtures/sanitization_samples/Large_Hadron_Collider-raw.txt +104 -0
  30. data/spec/fixtures/sanitization_samples/Large_Hadron_Collider-sanitized.txt +4 -0
  31. data/spec/fixtures/sanitization_samples/Metro_Goldwyn_Mayer-raw.txt +18 -0
  32. data/spec/fixtures/sanitization_samples/Metro_Goldwyn_Mayer-sanitized.txt +1 -0
  33. data/spec/fixtures/sanitization_samples/Middle_Ages-raw.txt +10 -0
  34. data/spec/fixtures/sanitization_samples/Middle_Ages-sanitized.txt +3 -0
  35. data/spec/fixtures/sanitization_samples/SMS_Elbing-raw.txt +51 -0
  36. data/spec/fixtures/sanitization_samples/SMS_Elbing-sanitized.txt +1 -0
  37. data/spec/fixtures/sanitization_samples/Sashimi-raw.txt +16 -0
  38. data/spec/fixtures/sanitization_samples/Sashimi-sanitized.txt +7 -0
  39. data/spec/fixtures/sanitization_samples/Superb_Fairywren-raw.txt +35 -0
  40. data/spec/fixtures/sanitization_samples/Superb_Fairywren-sanitized.txt +3 -0
  41. data/spec/fixtures/sanitization_samples/Velociraptor-raw.txt +28 -0
  42. data/spec/fixtures/sanitization_samples/Velociraptor-sanitized.txt +3 -0
  43. data/spec/lib/client_spec.rb +108 -0
  44. data/spec/lib/sanitize_spec.rb +14 -0
  45. data/spec/lib/url_spec.rb +8 -0
  46. data/spec/lib/wikipedia_spec.rb +20 -0
  47. data/spec/spec_helper.rb +4 -0
  48. data/tasks/wikipedia_tasks.rake +4 -0
  49. data/uninstall.rb +1 -0
  50. data/wikipedia-client.gemspec +96 -0
  51. metadata +134 -0
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.swp
2
+ *.tmp
3
+ *.log
4
+ pkg/*
5
+
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2008 [name of plugin creator]
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,81 @@
1
+ h1. Wikipedia
2
+
3
+ Allows you to get wikipedia content through their API. This uses the
4
+ alpha API (api.php), not the deprecated query.php API.
5
+
6
+ Wikipedia API reference: "http://en.wikipedia.org/w/api.php":http://en.wikipedia.org/w/api.php
7
+
8
+ Adopted from: "http://code.google.com/p/wikipedia-client/":http://code.google.com/p/wikipedia-client/
9
+
10
+ h2. Examples
11
+
12
+ <pre><code>require 'wikipedia'
13
+ page = Wikipedia.find( 'Getting Things Done' )
14
+
15
+ => #<Wikipedia::Page>
16
+
17
+ page.title
18
+
19
+ => 'Getting Things Done'
20
+
21
+ page.content
22
+
23
+ => # all the wiki markup appears here...
24
+
25
+ page.categories
26
+
27
+ => [..., "Category:Self-help books", ...]
28
+
29
+ page.links
30
+
31
+ => [..., "Business", "Cult following", ...]
32
+
33
+ page.images
34
+
35
+ => ["File:Getting Things Done.jpg", ...]
36
+
37
+ page.image_urls
38
+
39
+ => ["http://upload.wikimedia.org/wikipedia/en/e/e1/Getting_Things_Done.jpg", ...]</code></pre>
40
+
41
+ h2. Configuration
42
+
43
+ This is by default configured like this:
44
+
45
+ <pre><code>Wikipedia.Configure {
46
+ domain 'en.wikipedia.org'
47
+ path 'w/api.php'
48
+ }</code></pre>
49
+
50
+ h2. Advanced
51
+
52
+ See the API spec at "http://en.wikipedia.org/w/api.php":http://en.wikipedia.org/w/api.php
53
+
54
+ If you need data that is not already present, you can override
55
+ parameters.
56
+
57
+ For example, to retrieve only the page info:
58
+
59
+ <pre><code>page = Wikipedia.find( 'Getting Things Done', :prop => "info" )
60
+
61
+ page.title
62
+
63
+ => "Getting Things Done"
64
+
65
+ page.raw_data
66
+
67
+ => {"query"=>{"pages"=>{"959928"=>{"pageid"=>959928, "ns"=>0,
68
+ "title"=>"Getting Things Done", "touched"=>"2010-03-10T00:04:09Z",
69
+ "lastrevid"=>348481810, "counter"=>0, "length"=>7891}}}}</code></pre>
70
+
71
+ h2. Running specs
72
+
73
+ If you have rspec >= 1.1.3 installed, just run:
74
+
75
+ rake spec
76
+
77
+ h2. Thanks!
78
+
79
+ Copyright (c) 2008 [Cyril David], released under the MIT license
80
+
81
+ Adopted by Ken Pratt (ken@kenpratt.net) in 2010/03
data/Rakefile ADDED
@@ -0,0 +1,60 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Gem packaging tasks. Jeweler is optional: when it is missing we only print
# a hint so the remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "wikipedia-client"
    gem.summary = %Q{Ruby client for the Wikipedia API}
    gem.description = %Q{Ruby client for the Wikipedia API}
    gem.email = "christian.hellsten@gmail.com"
    gem.homepage = "http://github.com/christianhellsten/wikipedia-client"
    gem.authors = ["Cyril David", "Ken Pratt"]
    gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

desc 'Test the wikipedia plugin.'
task :spec do
  spec_path = File.expand_path(File.dirname(__FILE__) + '/spec/**/*.rb')
  # Propagate the runner's exit status so a failing spec run fails the task.
  system("spec -cfs #{spec_path}") || abort("spec run failed")
end

# Coverage task. RCov is optional; without it `rake rcov` explains how to get it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is defined by Jeweler; only depend on it when it exists,
# otherwise `rake spec` would abort with "Don't know how to build task".
task :spec => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :spec

# Rake dropped rake/rdoctask in favour of RDoc's own task; prefer the new one
# and fall back to the legacy class on old installations.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip the trailing newline of the VERSION file so the title stays on one line
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "wikipedia-client #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
data/init.rb ADDED
@@ -0,0 +1 @@
1
# Plugin entry point: load the library relative to this file's directory.
require File.join(File.dirname(__FILE__), 'lib', 'wikipedia')
data/install.rb ADDED
@@ -0,0 +1 @@
1
# Post-install hook: print the bundled README.
# The package ships README.textile (no bare "README"), so look the file up by
# prefix and stay silent when none is present instead of raising Errno::ENOENT.
readme = Dir[File.join(File.dirname(__FILE__), 'README*')].sort.first
puts File.read(readme) if readme
data/lib/wikipedia.rb ADDED
@@ -0,0 +1,37 @@
1
+ Dir[File.dirname(__FILE__) + '/wikipedia/**/*.rb'].each { |f| require f }
2
+
3
+ require 'uri'
4
+
5
module Wikipedia
  # Module-level convenience API around a shared Wikipedia::Client.
  #
  # Examples :
  #   page = Wikipedia.find('Rails')
  #   => #<Wikipedia::Page>
  #   page.content
  #   => wiki content appears here
  #
  # basically just a wrapper for doing
  #   client = Wikipedia::Client.new
  #   client.find('Rails')

  # Delegates to Client#find on the shared client.
  def self.find(page, options = {})
    client.find(page, options)
  end

  # Delegates to Client#find_image on the shared client.
  def self.find_image(title, options = {})
    client.find_image(title, options)
  end

  # Evaluates the block against the Configuration singleton, so directives
  # such as `domain` and `path` can be called bare inside it.
  def self.Configure(&block)
    Configuration.instance.instance_eval(&block)
  end

  # Lazily built client shared by the helpers above.
  # Internal helper; it was previously listed under a bare `private`, which
  # has no effect on `def self.` methods, so it stays callable as before.
  def self.client
    @client ||= Wikipedia::Client.new
  end

  # Default configuration.
  Configure {
    domain 'en.wikipedia.org'
    path   'w/api.php'
  }
end
@@ -0,0 +1,91 @@
1
module Wikipedia
  # Builds request URLs for the MediaWiki API, fetches them and wraps the
  # responses in Page objects.
  class Client
    # see http://en.wikipedia.org/w/api.php
    BASE_URL = "http://:domain/:path?action=:action&format=json"

    # A ":name" token inside BASE_URL.
    PLACEHOLDER = /:(\w+)/

    # Characters that get percent-encoded in URL values. This is the set the
    # removed URI.encode left alone, minus "&" and "+", which would otherwise
    # be read as a parameter separator / an encoded space.
    UNSAFE_URL_CHARS = /[^\-_.!~*'()a-zA-Z\d;\/?:@=$,\[\]]/

    attr_accessor :follow_redirects

    def initialize
      self.follow_redirects = true
    end

    # Fetches the page for +title+ (a title, or anything Url can turn into
    # one). When follow_redirects is set, redirects are followed until a
    # non-redirect page is reached or a redirect target repeats.
    #
    # Returns the last Page fetched.
    def find(title, options = {})
      title = normalize_title(title)
      page = Page.new(request_page(title, options))
      visited = [title]
      while follow_redirects && page.redirect?
        target = page.redirect_title
        # circular redirect (A -> B -> A): stop instead of looping forever
        break if visited.include?(target)
        visited << target
        page = Page.new(request_page(target, options))
      end
      page
    end

    # Fetches image info for +title+ and returns it wrapped in a Page.
    def find_image(title, options = {})
      title = normalize_title(title)
      Page.new(request_image(title, options))
    end

    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=revisions%7Clinks%7Cimages%7Ccategories&rvprop=content&titles=Flower%20(video%20game)
    def request_page(title, options = {})
      request({
        :action => "query",
        :prop => %w{ revisions links images categories },
        :rvprop => "content",
        :titles => title
      }.merge(options))
    end

    # http://en.wikipedia.org/w/api.php?action=query&format=json&prop=imageinfo&iiprop=url&titles=File:Flower.png
    def request_image(title, options = {})
      request({
        :action => "query",
        :prop => "imageinfo",
        :iiprop => "url",
        :titles => title
      }.merge(options))
    end

    # Performs the HTTP GET for +options+ and returns the response body.
    def request(options)
      require 'open-uri' # loaded lazily, only when a request is actually made
      URI.parse(url_for(options)).read("User-Agent" => "Ruby/#{RUBY_VERSION}")
    end

    protected

    def configuration_options
      {
        :domain => Configuration[:domain],
        :path => Configuration[:path]
      }
    end

    # Builds the request URL: options naming a BASE_URL placeholder are
    # substituted into the template, all others are appended as query
    # parameters in hash order. Placeholders without a value are left as is.
    def url_for(options)
      params = configuration_options.merge(options)
      placeholder_names = BASE_URL.scan(PLACEHOLDER).flatten

      # Substitute on the template in a single pass, so a value (for example
      # a title such as "Talk:redirects") can never be mistaken for a
      # placeholder later on.
      url = BASE_URL.gsub(PLACEHOLDER) do |token|
        name = token[1..-1]
        key = params.keys.find { |candidate| candidate.to_s == name }
        key.nil? ? token : urlify_value(params[key]).to_s
      end

      params.each do |key, val|
        next if placeholder_names.include?(key.to_s)
        url << "&#{key}=#{urlify_value(val)}"
      end
      url
    end

    # Arrays become a "|"-separated list, as the API expects for multi-values.
    def urlify_value(val)
      case val
      when Array
        encode(val.flatten.join('|'))
      else
        encode(val)
      end
    end

    # Percent-encodes a String byte by byte; non-strings pass through untouched.
    def encode(val)
      return val unless val.is_a?(String)
      val.gsub(UNSAFE_URL_CHARS) do |char|
        char.unpack('C*').map { |byte| '%%%02X' % byte }.join
      end
    end

    # Best effort: let Url extract a title; fall back to the input unchanged
    # when that fails.
    def normalize_title(title)
      Url.new(title).title
    rescue StandardError
      title
    end
  end
end
@@ -0,0 +1,25 @@
1
+ require 'singleton'
2
+
3
module Wikipedia
  # Process-wide settings holder (a Singleton). Each directive is a combined
  # reader/writer: called without arguments it returns the stored value,
  # called with an argument it stores that value.
  class Configuration
    include Singleton

    # Defines one reader/writer method per directive name.
    def self.directives(*directives)
      directives.each do |directive|
        ivar = "@#{directive}"
        define_method(directive) do |*args|
          if args.empty?
            instance_variable_get(ivar)
          else
            instance_variable_set(ivar, args.first)
          end
        end
      end
    end

    # Reads a directive from the singleton, e.g. Configuration[:domain].
    #
    # Raises NoMethodError when the instance has no public method of that
    # name. Without this check the lookup would go through `send` and could
    # run private methods (Configuration[:exit] would end the process).
    def self.[](directive)
      unless instance.respond_to?(directive)
        raise NoMethodError, "undefined directive `#{directive}' for #{name}"
      end
      instance.send(directive)
    end

    directives :domain, :path
  end
end
@@ -0,0 +1,109 @@
1
require 'json'

module Wikipedia
  # Wraps one JSON response of the MediaWiki query API and exposes the first
  # page it contains. Every accessor returns nil when the requested piece of
  # data is absent from the response.
  class Page
    # json - the raw JSON response body.
    def initialize(json)
      @json = json
      # The payload comes from a remote server: never instantiate Ruby
      # objects from "json_class" hints.
      @data = JSON.load(json, nil, :create_additions => false)
    end

    # The first entry of query.pages, or nil when the response has no such
    # section (for example an API error response).
    def page
      query = @data.is_a?(Hash) ? @data['query'] : nil
      pages = query.is_a?(Hash) ? query['pages'] : nil
      pages.values.first if pages.respond_to?(:values)
    end

    # Wiki markup of the first revision.
    def content
      revisions = page_field('revisions')
      return unless revisions
      revision = revisions.first
      return unless revision
      # The API stores the text under "*"; other keys (contentformat,
      # contentmodel, ...) may precede it, so don't rely on hash order.
      revision.key?('*') ? revision['*'] : revision.values.first
    end

    def sanitized_content
      self.class.sanitize(content)
    end

    # MatchData for the redirect directive (truthy) or nil/false.
    def redirect?
      text = content
      text && text.match(/\#REDIRECT\s+\[\[(.*?)\]\]/i)
    end

    # Target title when the page is a redirect, nil otherwise.
    def redirect_title
      if matches = redirect?
        matches[1]
      end
    end

    def title
      page_field('title')
    end

    def categories
      titles_of('categories')
    end

    def links
      titles_of('links')
    end

    def images
      titles_of('images')
    end

    # URL of the first imageinfo entry (present on find_image responses).
    def image_url
      info = page_field('imageinfo')
      info.first['url'] if info
    end

    # URLs of the page's jpg/jpeg/png/gif files, skipping "LinkFA-star".
    # Issues one Wikipedia.find_image request per image.
    def image_urls
      if list = images
        filtered = list.select { |i| i =~ /^file:.+\.(jpg|jpeg|png|gif)$/i && !i.include?("LinkFA-star") }
        filtered.map do |title|
          Wikipedia.find_image(title).image_url
        end
      end
    end

    # The parsed response.
    def raw_data
      @data
    end

    # The response as it was handed to the constructor.
    def json
      @json
    end

    # Reduces wiki markup to text with minimal HTML (<b>, <i>, <p>).
    # Returns nil when +s+ is nil; the argument itself is not modified.
    def self.sanitize(s)
      return unless s
      text = s.dup

      # strip anything inside curly braces, innermost first, until none is left
      loop { break unless text.gsub!(/\{\{[^\{\}]+?\}\}/, '') }

      # strip info box
      text.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')

      # strip internal links, keeping the label (or the target if unlabeled)
      text.gsub!(/\[\[([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
      text.gsub!(/\[\[([^\]\|]+?)\]\]/, '\1')

      # strip images and file links
      # NOTE(review): file links with zero or one "|" are already rewritten by
      # the link rules above, so only multi-part ones are removed here.
      text.gsub!(/\[\[Image:[^\[\]]+?\]\]/, '')
      text.gsub!(/\[\[File:[^\[\]]+?\]\]/, '')

      # convert bold/italic to html
      text.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
      text.gsub!(/'''(.+?)'''/, '<b>\1</b>')
      text.gsub!(/''(.+?)''/, '<i>\1</i>')

      # misc
      text.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')
      # comments may contain ">" themselves, so match up to the closing "-->"
      text.gsub!(/<!--[\s\S]*?-->/, '')
      # collapse double spaces left behind by the removals above
      text.gsub!('  ', ' ')
      text.strip!

      # create paragraphs
      paragraphs = text.split("\n\n")
      return text unless paragraphs.size > 1
      paragraphs.map { |paragraph| "<p>#{paragraph.strip}</p>" }.join("\n")
    end

    private

    # Value stored under +name+ in the page hash, nil without page data.
    def page_field(name)
      data = page
      data[name] if data
    end

    # "title" of every entry of the list stored under +name+.
    def titles_of(name)
      entries = page_field(name)
      entries.map { |entry| entry['title'] } if entries
    end
  end
end