freescrape 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ === 0.0.9 / 2008-09-28
2
+
3
+ * Initial release.
4
+ * Can request items from freebase.com using either a URL, an Item GUID or an
5
+ Item name.
6
+ * Preserves tags and other freebase links.
7
+
@@ -0,0 +1,14 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ lib/freebase.rb
6
+ lib/freebase/item_link.rb
7
+ lib/freebase/category.rb
8
+ lib/freebase/item.rb
9
+ lib/freebase/freebase.rb
10
+ lib/freebase/version.rb
11
+ tasks/spec.rb
12
+ spec/item_spec.rb
13
+ spec/freebase_spec.rb
14
+ spec/spec_helper.rb
@@ -0,0 +1,35 @@
1
+ = Freebase
2
+
3
+ * http://freebase.rubyforge.org/
4
+ * Postmodern (postmodern.mod3@gmail.com)
5
+
6
+ == DESCRIPTION:
7
+
8
+ A web-scraping interface to freebase.com, the open and shared database of the
9
+ world's knowledge.
10
+
11
+ == FEATURES/PROBLEMS:
12
+
13
+ * Can request items from freebase.com using either a URL, an Item GUID or an
14
+ Item name.
15
+ * Preserves tags and other freebase links.
16
+
17
+ == EXAMPLES:
18
+
19
+ require 'freebase'
20
+
21
+ Freebase.item('Aphex Twin')
22
+ # => #<Freebase::Item:0xb73fdba0 ...>
23
+
24
+ Freebase.item('http://www.freebase.com/view/guid/9202a8c04000641f8000000003ac957f')
25
+ # => #<Freebase::Item:0xb73fe3dc ...>
26
+
27
+ == REQUIREMENTS:
28
+
29
+ * Hpricot
30
+ * WWW::Mechanize
31
+
32
+ == INSTALL:
33
+
34
+ $ sudo gem install freescrape
35
+
@@ -0,0 +1,15 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'hoe'
5
+ require './tasks/spec.rb'
6
+ require './lib/freebase/version.rb'
7
+
8
# Gem specification for freescrape, declared through Hoe.
Hoe.new('freescrape', Freebase::VERSION) do |project|
  project.rubyforge_name = 'freebase'
  project.remote_rdoc_dir = ''
  project.extra_deps = %w[hpricot mechanize]
  project.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
end
14
+
15
+ # vim: syntax=Ruby
@@ -0,0 +1,3 @@
1
+ require 'freebase/item'
2
+ require 'freebase/freebase'
3
+ require 'freebase/version'
@@ -0,0 +1,33 @@
1
module Freebase
  #
  # A named group of metadata fields belonging to an item.
  #
  class Category

    # The name of the category, and the +Hash+ of metadata stored for it
    attr_reader :name, :metadata

    #
    # Builds a Category called _name_ which starts out with no metadata.
    #
    def initialize(name)
      @metadata = {}
      @name = name
    end

    #
    # Looks up the metadata stored under the given field _name_.
    #
    def [](name)
      metadata[name]
    end

    #
    # Gives the category name converted to a +String+.
    #
    def to_s
      name.to_s
    end

  end
end
@@ -0,0 +1,185 @@
1
+ require 'freebase/item'
2
+ require 'freebase/item_link'
3
+
4
+ require 'www/mechanize'
5
+ require 'hpricot'
6
+ require 'open-uri'
7
+
8
module Freebase
  # Common proxy port
  COMMON_PROXY_PORT = 8080

  # Default language
  DEFAULT_LANGUAGE = :en

  #
  # Returns the +Hash+ of proxy information. The +Hash+ is created on
  # first use with no host, the COMMON_PROXY_PORT and no credentials;
  # callers may modify it in place to configure a proxy.
  #
  def Freebase.proxy
    @free_base_proxy ||= {
      :host => nil,
      :port => COMMON_PROXY_PORT,
      :user => nil,
      :password => nil
    }
  end

  #
  # Creates a HTTP URI based from the given _proxy_info_ hash. The
  # _proxy_info_ hash defaults to Freebase.proxy, if not given. Returns
  # +nil+ if _proxy_info_ does not name a host.
  #
  # _proxy_info_ may contain the following keys:
  # <tt>:host</tt>:: The proxy host.
  # <tt>:port</tt>:: The proxy port. Defaults to COMMON_PROXY_PORT,
  #                  if not specified.
  # <tt>:user</tt>:: The user-name to login as.
  # <tt>:password</tt>:: The password to login with.
  #
  #   Freebase.proxy_uri(:host => 'proxy.example.com', :port => 3128)
  #   # => #<URI::HTTP http://proxy.example.com:3128/>
  #
  def Freebase.proxy_uri(proxy_info=Freebase.proxy)
    return nil unless proxy_info[:host]

    components = {
      :host => proxy_info[:host],
      :port => (proxy_info[:port] || COMMON_PROXY_PORT),
      :path => '/'
    }

    # only embed credentials when a user was actually given, otherwise
    # the URI would carry an empty ":" userinfo section
    if proxy_info[:user]
      credentials = proxy_info[:user].to_s

      if proxy_info[:password]
        credentials = "#{credentials}:#{proxy_info[:password]}"
      end

      components[:userinfo] = credentials
    end

    return URI::HTTP.build(components)
  end

  #
  # Returns the supported Freebase User-Agent Aliases.
  #
  def Freebase.user_agent_aliases
    WWW::Mechanize::AGENT_ALIASES
  end

  #
  # Returns the Freebase User-Agent. Defaults to the 'Windows IE 6'
  # User-Agent alias, if no User-Agent has been set.
  #
  def Freebase.user_agent
    @free_base_user_agent ||= Freebase.user_agent_aliases['Windows IE 6']
  end

  #
  # Sets the Freebase User-Agent to the specified _agent_.
  #
  def Freebase.user_agent=(agent)
    @free_base_user_agent = agent
  end

  #
  # Sets the Freebase User-Agent using the specified user-agent alias
  # _name_.
  #
  def Freebase.user_agent_alias=(name)
    @free_base_user_agent = Freebase.user_agent_aliases[name.to_s]
  end

  #
  # Opens the _uri_ with the given _options_. The contents of the _uri_
  # will be returned. An +ArgumentError+ is raised if the _uri_ is not
  # of a scheme that open-uri can open, so that local files and command
  # pipes are never opened by accident.
  #
  # _options_ may contain the following keys:
  # <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
  # <tt>:user_agent</tt>:: The User-Agent String to use.
  # <tt>:proxy</tt>:: A +Hash+ of proxy information which may
  #                   contain the following keys:
  #                   <tt>:host</tt>:: The proxy host.
  #                   <tt>:port</tt>:: The proxy port.
  #                   <tt>:user</tt>:: The user-name to login as.
  #                   <tt>:password</tt>:: The password to login with.
  #
  #   Freebase.open_uri('http://www.hackety.org/')
  #
  #   Freebase.open_uri('http://tenderlovemaking.com/',
  #     :user_agent_alias => 'Linux Mozilla')
  #   Freebase.open_uri('http://www.wired.com/',
  #     :user_agent => 'the future')
  #
  def Freebase.open_uri(uri,options={})
    headers = {}

    agent = if options[:user_agent_alias]
              Freebase.user_agent_aliases[options[:user_agent_alias].to_s]
            elsif options[:user_agent]
              options[:user_agent]
            else
              Freebase.user_agent
            end

    # an unknown alias resolves to nil; send no header rather than a nil one
    headers['User-Agent'] = agent if agent

    proxy = (options[:proxy] || Freebase.proxy)
    if proxy[:host]
      headers[:proxy] = Freebase.proxy_uri(proxy)
    end

    # open through the URI object itself instead of Kernel.open, which
    # would also accept local paths and "|command" strings
    target = if uri.respond_to?(:open)
               uri
             else
               URI.parse(uri.to_s)
             end

    unless target.respond_to?(:open)
      raise ArgumentError, "cannot open the URI #{uri.to_s.inspect}"
    end

    return target.open(headers)
  end

  #
  # Similar to Freebase.open_uri but returns an Hpricot document.
  #
  def Freebase.open_page(uri,options={})
    Hpricot(Freebase.open_uri(uri,options))
  end

  #
  # Creates a new WWW::Mechanize agent with the given _options_. If a
  # _block_ is given, it will be passed the new agent. The new agent is
  # returned.
  #
  # _options_ may contain the following keys:
  # <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
  # <tt>:user_agent</tt>:: The User-Agent string to use.
  # <tt>:proxy</tt>:: A +Hash+ of proxy information which may
  #                   contain the following keys:
  #                   <tt>:host</tt>:: The proxy host.
  #                   <tt>:port</tt>:: The proxy port.
  #                   <tt>:user</tt>:: The user-name to login as.
  #                   <tt>:password</tt>:: The password to login with.
  #
  #   Freebase.web_agent
  #
  #   Freebase.web_agent(:user_agent_alias => 'Linux Mozilla')
  #   Freebase.web_agent(:user_agent => 'Google Bot')
  #
  def Freebase.web_agent(options={},&block)
    agent = WWW::Mechanize.new

    if options[:user_agent_alias]
      agent.user_agent_alias = options[:user_agent_alias]
    elsif options[:user_agent]
      agent.user_agent = options[:user_agent]
    elsif Freebase.user_agent
      agent.user_agent = Freebase.user_agent
    end

    proxy = (options[:proxy] || Freebase.proxy)
    if proxy[:host]
      agent.set_proxy(proxy[:host],proxy[:port],proxy[:user],proxy[:password])
    end

    block.call(agent) if block
    return agent
  end

  #
  # Returns the language to access Freebase with. Defaults to
  # DEFAULT_LANGUAGE, if no language has been set.
  #
  def Freebase.language
    @free_base_language ||= DEFAULT_LANGUAGE
  end

  #
  # Sets the language to access Freebase with to the _new_language_.
  #
  def Freebase.language=(new_language)
    @free_base_language = new_language.to_sym
  end

  #
  # Returns the Item with the specified _descriptor_, which can be either
  # a URI to freebase.com, an Item GUID or an Item name.
  #
  #   Freebase.item('Aphex Twin')
  #   # => #<Freebase::Item:0xb73fdba0 ...>
  #
  def Freebase.item(descriptor)
    Item.from(descriptor)
  end
end
@@ -0,0 +1,171 @@
1
+ require 'freebase/item_link'
2
+ require 'freebase/category'
3
+
4
+ require 'uri'
5
+
6
module Freebase
  #
  # An item scraped from freebase.com, with its name, summary and the
  # categories of metadata found on its page.
  #
  class Item

    # Pattern of a bare Item GUID: exactly 32 lowercase hexadecimal
    # digits, so that short names made of hex letters (such as "abba")
    # are not mistaken for GUIDs
    GUID_PATTERN = /\A[0-9a-f]{32}\z/

    # Pattern of the URL to an item view on freebase.com
    VIEW_URL_PATTERN = /\Ahttps?:\/\/(www\.)?freebase\.com\/view/

    # URL of the item
    attr_reader :url

    # Name of the item
    attr_accessor :name

    # Description of the item
    attr_accessor :summary

    # Categories the item is in
    attr_reader :categories

    #
    # Creates a new Item object with the specified _url_ and the given
    # _options_.
    #
    # _options_ may contain the following keys:
    # <tt>:name</tt>:: The name of the item.
    # <tt>:summary</tt>:: The description of the item.
    #
    def initialize(url,options={})
      @url = url
      @name = options[:name]
      @summary = options[:summary]

      @categories = {}
    end

    #
    # Returns the Item object with the specified _descriptor_, which can
    # be either a URI to freebase.com, an Item GUID or an Item name.
    #
    def Item.from(descriptor)
      descriptor = descriptor.to_s

      if descriptor =~ GUID_PATTERN
        return Item.guid(descriptor)
      elsif descriptor =~ VIEW_URL_PATTERN
        return Item.from_url(descriptor)
      else
        return Item.named(descriptor)
      end
    end

    #
    # Returns the Item object with the specified _name_. The words of the
    # _name_ are down-cased and joined with underscores to form the URL
    # of the item, in the language given by Freebase.language.
    #
    def Item.named(name)
      key = name.to_s.split(' ').map { |word| word.downcase }.join('_')

      return Item.from_url("http://www.freebase.com/view/#{Freebase.language}/#{key}")
    end

    #
    # Returns the Item object with the specified _guid_.
    #
    def Item.guid(guid)
      Item.from_url("http://www.freebase.com/view/guid/#{guid}")
    end

    #
    # Creates the Item at the specified _url_, by requesting the page and
    # extracting the name, the summary and the metadata of every category
    # listed on it.
    #
    #   Item.from_url('http://www.freebase.com/view/guid/9202a8c04000641f800000000301146f')
    #   # => #<Freebase::Item:0xb73fdba0 ...>
    #
    def Item.from_url(url)
      url = URI(url.to_s)
      page = Freebase.open_page(url)
      new_item = Item.new(url)

      content = page.at('#content_main')

      new_item.name = content.at('#title//h1').inner_text.strip
      new_item.summary = content.at('#title/div.article-container/div.article').inner_html.strip

      content.search('div.domainsboxes//div.domainbox//div.typebox-container').each do |domainbox|
        category_name = domainbox.at('//div.typebox-column-title/a').inner_text
        new_category = Category.new(category_name)

        domainbox.search('//div.prop-typebox').each do |field|
          field_name = field.at('//span.prop-title').inner_text
          field_content = field.at('//div.prop-content')

          new_category.metadata[field_name] = extract_field(field_content,url)
        end

        new_item.categories[new_category.name] = new_category
      end

      return new_item
    end

    #
    # Returns the value of the field within _field_content_: an +Array+
    # of row Hashes for a table, an +Array+ of values for a list, or
    # +nil+ if the field contains neither.
    #
    def Item.extract_field(field_content,base_url)
      if (table = field_content.at('table.prop-table'))
        extract_table(table,base_url)
      elsif (list = field_content.at('ul.prop-list'))
        extract_list(list,base_url)
      end
    end

    #
    # Returns the rows of the _table_ as an +Array+ of Hashes, mapping
    # the column names to the values of the row. Empty cells and empty
    # rows are omitted.
    #
    def Item.extract_table(table,base_url)
      column_names = table.search('tr/th/div.prop-table-cell').map do |div|
        div.inner_text.strip
      end

      rows = []

      table.search('tr[td]').each do |row|
        field_row = {}

        row.search('td').each_with_index do |cell,index|
          if (value = extract_value(cell,base_url))
            field_row[column_names[index]] = value
          end
        end

        rows << field_row unless field_row.empty?
      end

      return rows
    end

    #
    # Returns the non-empty values of the items within the _list_.
    #
    def Item.extract_list(list,base_url)
      values = []

      list.search('li.prop-list-item').each do |list_item|
        if (value = extract_value(list_item,base_url))
          values << value
        end
      end

      return values
    end

    #
    # Returns the value held by the element _elem_: an ItemLink if the
    # element links to another item (the link is resolved against
    # _base_url_), the stripped text of the element if it has no
    # detail-view link, or +nil+ if there is nothing to extract.
    #
    def Item.extract_value(elem,base_url)
      if (item_link = elem.at('a.pv'))
        ItemLink.new(item_link.inner_text.strip, base_url.merge(item_link['href']))
      elsif elem.at('a.detail-view').nil?
        text = elem.inner_text.strip

        text unless text.empty?
      end
    end

    private_class_method :extract_field, :extract_table,
                         :extract_list, :extract_value

    #
    # Returns the category names of the item.
    #
    def category_names
      @categories.keys
    end

    #
    # Returns the Category with the specified _name_ of the item.
    #
    def [](name)
      @categories[name]
    end

    #
    # Returns the name of the item.
    #
    def to_s
      @name.to_s
    end

  end
end
@@ -0,0 +1,26 @@
1
module Freebase
  #
  # A titled reference to another item, as found within item metadata.
  #
  class ItemLink

    # The title of the linked item, and the URL it is found at
    attr_reader :title, :url

    #
    # Builds an ItemLink which points at _url_ and is labelled _title_.
    #
    def initialize(title,url)
      @url = url
      @title = title
    end

    #
    # Gives the title of the item-link converted to a +String+.
    #
    def to_s
      title.to_s
    end

  end
end
@@ -0,0 +1,3 @@
1
module Freebase
  # Version of the freescrape library
  VERSION = '0.0.9'
end
@@ -0,0 +1,33 @@
1
+ require 'spec_helper'
2
+
3
# Specifications of the module-level Freebase interface. The item lookups
# below are made against the live freebase.com site.
describe Freebase do
  it "should have a version" do
    Freebase::VERSION.should_not be_nil
  end

  it "should have a default language" do
    Freebase.language.should_not be_nil
  end

  it "should have a default User-Agent string" do
    Freebase.user_agent.should_not be_nil
  end

  it "should return an item from a given URL" do
    item = Freebase.item('http://www.freebase.com/view/en/squarepusher/')

    item.should_not be_nil
    item.name.should == 'Squarepusher'
  end

  it "should return an item from a given GUID" do
    item = Freebase.item('9202a8c04000641f8000000000184c7a')

    item.should_not be_nil
    item.name.should == 'Conflict'
  end

  it "should return an item for a given title" do
    item = Freebase.item('Aphex Twin')

    item.should_not be_nil
    item.name.should == 'Aphex Twin'
  end
end
@@ -0,0 +1,34 @@
1
+ require 'spec_helper'
2
+
3
# Specifications of an Item scraped from its freebase.com page. The item
# is requested once and shared by all of the examples.
describe Item do
  before(:all) do
    @item = Item.from_url('http://www.freebase.com/view/en/aphex_twin')
  end

  it "should have a URL" do
    @item.url.should_not be_nil
  end

  it "should have a name" do
    name = @item.name

    name.should_not be_nil
    name.should_not be_empty
  end

  it "should have categories" do
    @item.categories.should_not be_empty
  end

  it "should have category names" do
    @item.category_names.should_not be_empty
  end

  it "should have metadata for each category" do
    @item.categories.values.each do |category|
      metadata = category.metadata

      metadata.should_not be_empty
      metadata.values.each { |data| data.should_not be_nil }
    end
  end
end
@@ -0,0 +1,7 @@
1
+ require 'rubygems'
2
+ gem 'rspec', '>=1.1.3'
3
+ require 'spec'
4
+
5
+ require 'freebase'
6
+
7
+ include Freebase
@@ -0,0 +1,7 @@
1
+ require 'spec/rake/spectask'
2
+
3
# Defines the :spec rake task, with lib/ and spec/ added to the task's libs.
desc "Run all specifications"
Spec::Rake::SpecTask.new(:spec) do |task|
  task.spec_opts = %w[--colour --format specdoc]
  task.libs += %w[lib spec]
end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: freescrape
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.9
5
+ platform: ruby
6
+ authors:
7
+ - Postmodern Modulus III
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-09-29 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: mechanize
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: hoe
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 1.7.0
44
+ version:
45
+ description: A web-scraping interface to freebase.com, the open and shared database of the world's knowledge.
46
+ email:
47
+ - postmodern.mod3@gmail.com
48
+ executables: []
49
+
50
+ extensions: []
51
+
52
+ extra_rdoc_files:
53
+ - History.txt
54
+ - Manifest.txt
55
+ - README.txt
56
+ files:
57
+ - History.txt
58
+ - Manifest.txt
59
+ - README.txt
60
+ - Rakefile
61
+ - lib/freebase.rb
62
+ - lib/freebase/item_link.rb
63
+ - lib/freebase/category.rb
64
+ - lib/freebase/item.rb
65
+ - lib/freebase/freebase.rb
66
+ - lib/freebase/version.rb
67
+ - tasks/spec.rb
68
+ - spec/item_spec.rb
69
+ - spec/freebase_spec.rb
70
+ - spec/spec_helper.rb
71
+ has_rdoc: true
72
+ homepage: http://freebase.rubyforge.org/
73
+ post_install_message:
74
+ rdoc_options:
75
+ - --main
76
+ - README.txt
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: "0"
84
+ version:
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: "0"
90
+ version:
91
+ requirements: []
92
+
93
+ rubyforge_project: freebase
94
+ rubygems_version: 1.2.0
95
+ signing_key:
96
+ specification_version: 2
97
+ summary: A web-scraping interface to freebase.com, the open and shared database of the world's knowledge.
98
+ test_files: []
99
+