free-scrape 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,12 @@
1
+ === 0.1.0 / 2008-09-29
2
+
3
+ * Renamed freebase to free-scrape, in order to not conflict with the other
4
+ freebase gem from the freebaseapi project.
5
+
6
+ === 0.0.9 / 2008-09-28
7
+
8
+ * Initial release.
9
+ * Can request items from freebase.com using either a URL, an Item GUID or an
10
+ Item name.
11
+ * Preserves tags and other freebase links.
12
+
@@ -0,0 +1,14 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ lib/free_scrape.rb
6
+ lib/free_scrape/item_link.rb
7
+ lib/free_scrape/category.rb
8
+ lib/free_scrape/item.rb
9
+ lib/free_scrape/free_scrape.rb
10
+ lib/free_scrape/version.rb
11
+ tasks/spec.rb
12
+ spec/item_spec.rb
13
+ spec/free_scrape_spec.rb
14
+ spec/spec_helper.rb
@@ -0,0 +1,35 @@
1
+ = FreeScrape
2
+
3
+ * http://freebase.rubyforge.org/
4
+ * Postmodern (postmodern.mod3@gmail.com)
5
+
6
+ == DESCRIPTION:
7
+
8
+ A web-scraping interface to freebase.com, the open and shared database of the
9
+ world's knowledge.
10
+
11
+ == FEATURES/PROBLEMS:
12
+
13
+ * Can request items from freebase.com using either a URL, an Item GUID or an
14
+ Item name.
15
+ * Preserves tags and other freebase links.
16
+
17
+ == EXAMPLES:
18
+
19
+ require 'free_scrape'
20
+
21
+ FreeScrape.item('Aphex Twin')
22
+ # => #<FreeScrape::Item:0xb73fdba0 ...>
23
+
24
+ FreeScrape.item('http://www.freebase.com/view/guid/9202a8c04000641f8000000003ac957f')
25
+ # => #<FreeScrape::Item:0xb73fe3dc ...>
26
+
27
+ == REQUIREMENTS:
28
+
29
+ * Hpricot
30
+ * WWW::Mechanize
31
+
32
+ == INSTALL:
33
+
34
+ $ sudo gem install free-scrape
35
+
@@ -0,0 +1,15 @@
1
# -*- ruby -*-
#
# Build script: pulls in the spec task and the gem version, then hands the
# project description to Hoe.

require 'rubygems'
require 'hoe'
require './tasks/spec.rb'
require './lib/free_scrape/version.rb'

Hoe.new('free-scrape', FreeScrape::VERSION) do |project|
  # The RubyForge project name differs from the gem name.
  project.rubyforge_name = 'freebase'
  project.remote_rdoc_dir = ''

  project.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')

  # Runtime gem dependencies.
  project.extra_deps = %w[hpricot mechanize]
end

# vim: syntax=Ruby
@@ -0,0 +1,3 @@
1
+ require 'free_scrape/item'
2
+ require 'free_scrape/free_scrape'
3
+ require 'free_scrape/version'
@@ -0,0 +1,33 @@
1
module FreeScrape
  #
  # A named group of metadata fields.
  #
  class Category

    # name::     Name of the category.
    # metadata:: +Hash+ of metadata associated with the category.
    attr_reader :name, :metadata

    #
    # Builds a Category called _name_; its metadata starts out empty.
    #
    def initialize(name)
      @metadata = {}
      @name = name
    end

    #
    # Looks up the metadata stored under _name_. Returns +nil+ when there
    # is no such entry.
    #
    def [](name)
      metadata[name]
    end

    #
    # The name of the category, converted to a +String+.
    #
    def to_s
      name.to_s
    end

  end
end
@@ -0,0 +1,185 @@
1
+ require 'free_scrape/item'
2
+ require 'free_scrape/item_link'
3
+
4
+ require 'www/mechanize'
5
+ require 'hpricot'
6
+ require 'open-uri'
7
+
8
module FreeScrape
  # Common proxy port
  COMMON_PROXY_PORT = 8080

  # Default language
  DEFAULT_LANGUAGE = :en

  #
  # Returns the +Hash+ of global proxy information. The hash is created on
  # first use with no host, COMMON_PROXY_PORT as the port and no
  # credentials; callers configure the proxy by mutating it.
  #
  def FreeScrape.proxy
    @@free_scrape_proxy ||= {
      :host => nil,
      :port => COMMON_PROXY_PORT,
      :user => nil,
      :password => nil
    }
  end

  #
  # Creates a HTTP URI from the given _proxy_info_ hash. The _proxy_info_
  # hash defaults to FreeScrape.proxy, if not given. Returns +nil+ when
  # _proxy_info_ has no <tt>:host</tt>.
  #
  # _proxy_info_ may contain the following keys:
  # <tt>:host</tt>:: The proxy host.
  # <tt>:port</tt>:: The proxy port. Defaults to COMMON_PROXY_PORT,
  #                  if not specified.
  # <tt>:user</tt>:: The user-name to login as.
  # <tt>:password</tt>:: The password to login with.
  #
  #   FreeScrape.proxy_uri(:host => 'proxy.example.com', :port => 3128)
  #   # => #<URI::HTTP http://proxy.example.com:3128/>
  #
  def FreeScrape.proxy_uri(proxy_info=FreeScrape.proxy)
    return nil unless proxy_info[:host]

    # Only emit a userinfo component when a user was actually supplied, so
    # an unauthenticated proxy does not end up as "http://:@host:port/".
    userinfo = if proxy_info[:user]
      [proxy_info[:user], proxy_info[:password]].compact.join(':')
    end

    URI::HTTP.build(
      :host => proxy_info[:host],
      :port => (proxy_info[:port] || COMMON_PROXY_PORT),
      :userinfo => userinfo,
      :path => '/'
    )
  end

  #
  # Returns the supported FreeScrape User-Agent Aliases.
  #
  def FreeScrape.user_agent_aliases
    WWW::Mechanize::AGENT_ALIASES
  end

  #
  # Returns the FreeScrape User-Agent. Falls back to the 'Windows IE 6'
  # alias when no User-Agent has been set.
  #
  def FreeScrape.user_agent
    @@free_scrape_user_agent ||= FreeScrape.user_agent_aliases['Windows IE 6']
  end

  #
  # Sets the FreeScrape User-Agent to the specified _agent_.
  #
  def FreeScrape.user_agent=(agent)
    @@free_scrape_user_agent = agent
  end

  #
  # Sets the FreeScrape User-Agent using the specified user-agent alias
  # _name_.
  #
  def FreeScrape.user_agent_alias=(name)
    @@free_scrape_user_agent = FreeScrape.user_agent_aliases[name.to_s]
  end

  #
  # Opens the _uri_ with the given _options_. The contents of the _uri_
  # will be returned.
  #
  # _uri_ may be a URI object or anything whose +to_s+ parses as a URI.
  # It is always opened through open-uri, never as a local path.
  #
  # _options_ may contain the following keys:
  # <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
  # <tt>:user_agent</tt>:: The User-Agent String to use.
  # <tt>:proxy</tt>:: A +Hash+ of proxy information which may
  #                   contain the following keys:
  #                   <tt>:host</tt>:: The proxy host.
  #                   <tt>:port</tt>:: The proxy port.
  #                   <tt>:user</tt>:: The user-name to login as.
  #                   <tt>:password</tt>:: The password to login with.
  #
  #   FreeScrape.open_uri('http://www.hackety.org/')
  #
  #   FreeScrape.open_uri('http://tenderlovemaking.com/',
  #     :user_agent_alias => 'Linux Mozilla')
  #   FreeScrape.open_uri('http://www.wired.com/',
  #     :user_agent => 'the future')
  #
  def FreeScrape.open_uri(uri,options={})
    headers = {}

    if options[:user_agent_alias]
      # to_s keeps the lookup consistent with FreeScrape.user_agent_alias=
      headers['User-Agent'] = FreeScrape.user_agent_aliases[options[:user_agent_alias].to_s]
    elsif options[:user_agent]
      headers['User-Agent'] = options[:user_agent]
    elsif FreeScrape.user_agent
      headers['User-Agent'] = FreeScrape.user_agent
    end

    proxy = (options[:proxy] || FreeScrape.proxy)
    headers[:proxy] = FreeScrape.proxy_uri(proxy) if proxy[:host]

    # Go through URI#open (added by open-uri) instead of Kernel.open, so a
    # String argument is never treated as a file name or a "|command" pipe.
    uri = URI(uri.to_s) unless uri.kind_of?(URI::Generic)

    return uri.open(headers)
  end

  #
  # Similar to FreeScrape.open_uri but returns an Hpricot document.
  #
  def FreeScrape.open_page(uri,options={})
    Hpricot(FreeScrape.open_uri(uri,options))
  end

  #
  # Creates a new WWW::Mechanize agent with the given _options_. If a
  # _block_ is given, it is passed the configured agent. Returns the agent.
  #
  # _options_ may contain the following keys:
  # <tt>:user_agent_alias</tt>:: The User-Agent Alias to use.
  # <tt>:user_agent</tt>:: The User-Agent string to use.
  # <tt>:proxy</tt>:: A +Hash+ of proxy information which may
  #                   contain the following keys:
  #                   <tt>:host</tt>:: The proxy host.
  #                   <tt>:port</tt>:: The proxy port.
  #                   <tt>:user</tt>:: The user-name to login as.
  #                   <tt>:password</tt>:: The password to login with.
  #
  #   FreeScrape.web_agent
  #
  #   FreeScrape.web_agent(:user_agent_alias => 'Linux Mozilla')
  #   FreeScrape.web_agent(:user_agent => 'Google Bot')
  #
  def FreeScrape.web_agent(options={},&block)
    agent = WWW::Mechanize.new

    if options[:user_agent_alias]
      agent.user_agent_alias = options[:user_agent_alias]
    elsif options[:user_agent]
      agent.user_agent = options[:user_agent]
    elsif FreeScrape.user_agent
      agent.user_agent = FreeScrape.user_agent
    end

    proxy = (options[:proxy] || FreeScrape.proxy)
    if proxy[:host]
      agent.set_proxy(proxy[:host],proxy[:port],proxy[:user],proxy[:password])
    end

    block.call(agent) if block
    return agent
  end

  #
  # Returns the language to access FreeScrape with. Defaults to
  # DEFAULT_LANGUAGE.
  #
  def FreeScrape.language
    @@free_scrape_language ||= DEFAULT_LANGUAGE
  end

  #
  # Sets the language to access FreeScrape with to the _new_language_,
  # which is stored as a +Symbol+.
  #
  def FreeScrape.language=(new_language)
    @@free_scrape_language = new_language.to_sym
  end

  #
  # Returns the Item with the specified _descriptor_, which can be either
  # a URI to freebase.com, an Item GUID or an Item name.
  #
  #   FreeScrape.item('Aphex Twin')
  #   # => #<FreeScrape::Item:0xb73fdba0 ...>
  #
  def FreeScrape.item(descriptor)
    Item.from(descriptor)
  end
end
@@ -0,0 +1,171 @@
1
+ require 'free_scrape/item_link'
2
+ require 'free_scrape/category'
3
+
4
+ require 'uri'
5
+
6
module FreeScrape
  #
  # An item scraped from a freebase.com view page: its URL, name, summary
  # and the categories of metadata listed on the page.
  #
  class Item

    # URL of the item
    attr_reader :url

    # Name of the item
    attr_accessor :name

    # Description of the item
    attr_accessor :summary

    # Categories the item is in (a +Hash+ of category name to Category)
    attr_reader :categories

    #
    # Creates a new Item object with the specified _url_ and the given
    # _options_.
    #
    # _options_ may contain the following keys:
    # <tt>:name</tt>:: The name of the item.
    # <tt>:summary</tt>:: The description of the item.
    #
    def initialize(url,options={})
      @url = url
      @name = options[:name]
      @summary = options[:summary]

      @categories = {}
    end

    #
    # Returns the Item object with the specified _descriptor_, which can
    # be either a URI to freebase.com, an Item GUID or an Item name.
    #
    # A descriptor is only treated as a GUID when it is exactly 32
    # lower-case hexadecimal digits, so that names made up of hex letters
    # (such as "beef" or "decade") are still looked up by name.
    #
    def Item.from(descriptor)
      descriptor = descriptor.to_s

      # \A and \z anchor the whole string; ^ and $ would match per line.
      if descriptor =~ /\A[0-9a-f]{32}\z/
        return Item.guid(descriptor)
      elsif descriptor =~ %r{\Ahttps?://(www\.)?freebase\.com/view}
        return Item.from_url(descriptor)
      else
        return Item.named(descriptor)
      end
    end

    #
    # Returns the Item object with the specified _name_. The name is
    # lower-cased and its words joined with underscores to form the
    # freebase.com path, under the current FreeScrape.language.
    #
    def Item.named(name)
      name = name.split(' ').map { |word|
        word.downcase
      }.join('_')

      return Item.from_url("http://www.freebase.com/view/#{FreeScrape.language}/#{name}")
    end

    #
    # Returns the Item object with the specified _guid_.
    #
    def Item.guid(guid)
      Item.from_url("http://www.freebase.com/view/guid/#{guid}")
    end

    #
    # Creates the Item at the specified _url_ by requesting the page and
    # scraping the name, summary and categories out of it.
    #
    #   Item.from_url('http://www.freebase.com/view/guid/9202a8c04000641f800000000301146f')
    #   # => #<FreeScrape::Item:0xb73fdba0 ...>
    #
    def Item.from_url(url)
      url = URI(url.to_s)
      page = FreeScrape.open_page(url)
      new_item = Item.new(url)

      content = page.at('#content_main')

      new_item.name = content.at('#title//h1').inner_text.strip
      new_item.summary = content.at('#title/div.article-container/div.article').inner_html.strip

      # Turns a table cell or list item into a value:
      # * an ItemLink when it contains an 'a.pv' anchor (the href is
      #   resolved against the item's own URL),
      # * its stripped text when it contains no 'a.detail-view' anchor
      #   (+nil+ if that text is empty),
      # * +nil+ otherwise.
      extract_value = lambda { |elem|
        if (item_link = elem.at('a.pv'))
          link_url = new_item.url.merge(item_link['href'])

          ItemLink.new(item_link.inner_text.strip, link_url)
        elsif elem.at('a.detail-view').nil?
          text = elem.inner_text.strip

          if text.empty?
            nil
          else
            text
          end
        end
      }

      # Each typebox container becomes one Category.
      content.search('div.domainsboxes//div.domainbox//div.typebox-container') do |domainbox|
        category_name = domainbox.at('//div.typebox-column-title/a').inner_text
        new_category = Category.new(category_name)

        # Each property box becomes one metadata entry of the category.
        domainbox.search('//div.prop-typebox') do |field|
          field_name = field.at('//span.prop-title').inner_text
          field_content = field.at('//div.prop-content')

          # Stays nil when the property is neither a table nor a list.
          field_value = nil

          if (table = field_content.at('table.prop-table'))
            # Tables become an Array of Hashes keyed by column title.
            field_value = []

            column_names = table.search('tr/th/div.prop-table-cell').map do |div|
              div.inner_text.strip
            end

            table.search('tr[td]') do |row|
              field_row = {}
              index = 0

              row.search('td') do |cell|
                if (value = extract_value.call(cell))
                  field_row[column_names[index]] = value
                end

                # Advance even for skipped cells so that later cells stay
                # aligned with their column titles.
                index += 1
              end

              field_value << field_row unless field_row.empty?
            end
          elsif (list = field_content.at('ul.prop-list'))
            # Lists become an Array of the non-nil extracted values.
            field_value = []

            list.search('li.prop-list-item') do |list_item|
              if (value = extract_value.call(list_item))
                field_value << value
              end
            end
          end

          new_category.metadata[field_name] = field_value
        end

        new_item.categories[new_category.name] = new_category
      end

      return new_item
    end

    #
    # Returns the category names of the item.
    #
    def category_names
      @categories.keys
    end

    #
    # Returns the Category with the specified _name_ of the item.
    #
    def [](name)
      @categories[name]
    end

    #
    # Returns the name of the item.
    #
    def to_s
      @name.to_s
    end

  end
end
@@ -0,0 +1,26 @@
1
module FreeScrape
  #
  # A reference to another item: a title paired with a URL.
  #
  class ItemLink

    # title:: Title of the linked item.
    # url::   URL of the linked item.
    attr_reader :title, :url

    #
    # Builds an ItemLink out of the given _title_ and _url_.
    #
    def initialize(title,url)
      @url = url
      @title = title
    end

    #
    # The title of the item-link, converted to a +String+.
    #
    def to_s
      title.to_s
    end

  end
end
@@ -0,0 +1,3 @@
1
module FreeScrape
  # Version of the free-scrape gem. Frozen so that the shared String
  # cannot be modified in place by code that reads it.
  VERSION = '0.1.0'.freeze
end
@@ -0,0 +1,33 @@
1
+ require 'spec_helper'
2
+
3
# Specs for the module-level FreeScrape API. The item lookups request
# pages from freebase.com.
describe FreeScrape do
  it "should have a version" do
    FreeScrape.const_get('VERSION').should_not be_nil
  end

  it "should have a default language" do
    FreeScrape.language.should_not be_nil
  end

  it "should have a default User-Agent string" do
    FreeScrape.user_agent.should_not be_nil
  end

  it "should return an item from a given URL" do
    item = FreeScrape.item('http://www.freebase.com/view/en/squarepusher/')

    item.should_not be_nil
    item.name.should == 'Squarepusher'
  end

  it "should return an item from a given GUID" do
    item = FreeScrape.item('9202a8c04000641f8000000000184c7a')

    item.should_not be_nil
    item.name.should == 'Conflict'
  end

  it "should return an item for a given title" do
    item = FreeScrape.item('Aphex Twin')

    item.should_not be_nil
    item.name.should == 'Aphex Twin'
  end
end
@@ -0,0 +1,34 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Item, run against a single item fetched once from
# freebase.com.
describe Item do
  before(:all) do
    @item = Item.from_url('http://www.freebase.com/view/en/aphex_twin')
  end

  it "should have a URL" do
    @item.url.should_not be_nil
  end

  it "should have a name" do
    item_name = @item.name

    item_name.should_not be_nil
    item_name.should_not be_empty
  end

  it "should have categories" do
    @item.categories.should_not be_empty
  end

  it "should have category names" do
    @item.category_names.should_not be_empty
  end

  it "should have metadata for each category" do
    @item.categories.each_value do |category|
      fields = category.metadata
      fields.should_not be_empty

      fields.each_value { |data| data.should_not be_nil }
    end
  end
end
@@ -0,0 +1,7 @@
1
+ require 'rubygems'
2
+ gem 'rspec', '>=1.1.3'
3
+ require 'spec'
4
+
5
+ require 'free_scrape'
6
+
7
+ include FreeScrape
@@ -0,0 +1,7 @@
1
+ require 'spec/rake/spectask'
2
+
3
# Defines the :spec rake task.
desc "Run all specifications"
Spec::Rake::SpecTask.new(:spec) do |task|
  # Add the library and spec directories to the task's lib paths.
  task.libs += %w[lib spec]
  task.spec_opts = %w[--colour --format specdoc]
end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: free-scrape
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Postmodern Modulus III
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-09-29 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: mechanize
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: hoe
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 1.7.0
44
+ version:
45
+ description: A web-scraping interface to freebase.com, the open and shared database of the world's knowledge.
46
+ email:
47
+ - postmodern.mod3@gmail.com
48
+ executables: []
49
+
50
+ extensions: []
51
+
52
+ extra_rdoc_files:
53
+ - History.txt
54
+ - Manifest.txt
55
+ - README.txt
56
+ files:
57
+ - History.txt
58
+ - Manifest.txt
59
+ - README.txt
60
+ - Rakefile
61
+ - lib/free_scrape.rb
62
+ - lib/free_scrape/item_link.rb
63
+ - lib/free_scrape/category.rb
64
+ - lib/free_scrape/item.rb
65
+ - lib/free_scrape/free_scrape.rb
66
+ - lib/free_scrape/version.rb
67
+ - tasks/spec.rb
68
+ - spec/item_spec.rb
69
+ - spec/free_scrape_spec.rb
70
+ - spec/spec_helper.rb
71
+ has_rdoc: true
72
+ homepage: http://freebase.rubyforge.org/
73
+ post_install_message:
74
+ rdoc_options:
75
+ - --main
76
+ - README.txt
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: "0"
84
+ version:
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: "0"
90
+ version:
91
+ requirements: []
92
+
93
+ rubyforge_project: freebase
94
+ rubygems_version: 1.2.0
95
+ signing_key:
96
+ specification_version: 2
97
+ summary: A web-scraping interface to freebase.com, the open and shared database of the world's knowledge.
98
+ test_files: []
99
+