gscraper 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. data/.gitignore +8 -0
  2. data/.specopts +1 -0
  3. data/.yardopts +1 -0
  4. data/ChangeLog.md +122 -0
  5. data/Gemfile +25 -0
  6. data/{README.txt → README.md} +25 -24
  7. data/Rakefile +32 -10
  8. data/gscraper.gemspec +112 -0
  9. data/lib/gscraper.rb +0 -2
  10. data/lib/gscraper/extensions.rb +0 -2
  11. data/lib/gscraper/extensions/uri.rb +0 -2
  12. data/lib/gscraper/extensions/uri/http.rb +0 -2
  13. data/lib/gscraper/extensions/uri/query_params.rb +18 -5
  14. data/lib/gscraper/gscraper.rb +61 -70
  15. data/lib/gscraper/has_pages.rb +76 -20
  16. data/lib/gscraper/licenses.rb +0 -2
  17. data/lib/gscraper/page.rb +45 -16
  18. data/lib/gscraper/search.rb +0 -2
  19. data/lib/gscraper/search/ajax_query.rb +75 -22
  20. data/lib/gscraper/search/page.rb +328 -122
  21. data/lib/gscraper/search/query.rb +100 -7
  22. data/lib/gscraper/search/result.rb +27 -6
  23. data/lib/gscraper/search/search.rb +59 -9
  24. data/lib/gscraper/search/web_query.rb +120 -37
  25. data/lib/gscraper/sponsored_ad.rb +19 -6
  26. data/lib/gscraper/sponsored_links.rb +260 -92
  27. data/lib/gscraper/version.rb +2 -3
  28. data/spec/extensions/uri/query_params_spec.rb +8 -0
  29. data/spec/gscraper_spec.rb +9 -4
  30. data/spec/has_pages_examples.rb +0 -2
  31. data/spec/has_sponsored_links_examples.rb +2 -1
  32. data/spec/helpers/query.rb +3 -1
  33. data/spec/helpers/uri.rb +6 -4
  34. data/spec/page_has_results_examples.rb +0 -2
  35. data/spec/search/ajax_query_spec.rb +6 -11
  36. data/spec/search/page_has_results_examples.rb +0 -2
  37. data/spec/search/web_query_spec.rb +6 -11
  38. data/spec/spec_helper.rb +10 -4
  39. metadata +147 -54
  40. data/History.txt +0 -101
  41. data/Manifest.txt +0 -38
  42. data/tasks/spec.rb +0 -9
@@ -0,0 +1,8 @@
1
+ doc
2
+ pkg
3
+ tmp/*
4
+ .DS_Store
5
+ .bundle
6
+ .yardoc
7
+ *.swp
8
+ *~
@@ -0,0 +1 @@
1
+ --colour --format specdoc
@@ -0,0 +1 @@
1
+ --markup markdown --title 'GScraper Documentation' --protected --files ChangeLog.md,COPYING.txt
@@ -0,0 +1,122 @@
1
+ ### 0.3.0 / 2010-07-01
2
+
3
+ * Upgraded to mechanize ~> 1.0.0.
4
+ * Upgraded from json to json_pure ~> 1.4.0.
5
+ * Switched from Hoe to Jeweler for building RubyGems.
6
+ * Switched to Markdown documentation syntax with full YARD tags.
7
+ * Added support for `:allinanchor` and `:inanchor` options to
8
+ {GScraper::Search::Query}.
9
+ * Added support for the `:define` option in {GScraper::Search::Query}.
10
+ * Aliased {GScraper::Search::WebQuery#similar_to} to `related`.
11
+ * Aliased {GScraper::Search::WebQuery#similar_to=} to `related=`.
12
+ * Aliased {GScraper::Search::WebQuery#links_to} to `link`.
13
+ * Aliased {GScraper::Search::WebQuery#links_to=} to `link=`.
14
+ * Removed `GScraper.open_uri`.
15
+ * Removed `GScraper.open_page`.
16
+ * Fixed the escaping/unescaping of URL query params in {URI::QueryParams}.
17
+ * Use `yield` instead of `block.call`, when possible.
18
+ * All enumerable methods now return an `Enumerator` object, if no block was
19
+ given.
20
+
21
+ ### 0.2.4 / 2009-03-18
22
+
23
+ * Added {GScraper::SponsoredAd#direct_link}.
24
+ * Fixed a bug in {GScraper::SponsoredAd#direct_url} where direct links
25
+ were not being URI escaped.
26
+ * Removed last references to Hpricot, replacing them with Nokogiri.
27
+
28
+ ### 0.2.3 / 2009-01-27
29
+
30
+ * Fixed a bug in {GScraper::Search::WebQuery#page}, when the search query
31
+ returned less results than the expected results-per-page.
32
+
33
+ ### 0.2.2 / 2009-01-14
34
+
35
+ * Updated {GScraper::Search::WebQuery} to use Nokogiri properly.
36
+
37
+ ### 0.2.1 / 2008-08-27
38
+
39
+ * Updated XPath queries in {GScraper::Search::WebQuery} for new Google (tm)
40
+ Search Result HTML schema.
41
+
42
+ ### 0.2.0 / 2008-05-10
43
+
44
+ * Removed `GScraper::WebAgent`.
45
+ * Added {GScraper::Page} and {GScraper::HasPages}.
46
+ * {GScraper::Search::Result#page} and {GScraper::Search::Result#cached_page}
47
+ no longer receive blocks.
48
+ * Added `GScraper::Search::Query` which supports building query expressions.
49
+ * {GScraper::SponsoredLinks#initialize} and {GScraper::Page#initialize}
50
+ now take blocks.
51
+ * Renamed `GScraper::Search::Query` to {GScraper::Search::WebQuery}.
52
+ * {GScraper::Search::WebQuery#page} and
53
+ {GScraper::Search::WebQuery#sponsored_links} no longer take blocks.
54
+ * Added {GScraper::Search::AJAXQuery}.
55
+ * Replaced Unit Tests with Rspec specifications.
56
+
57
+ ### 0.1.8 / 2008-04-30
58
+
59
+ * Added the {GScraper.user_agent_alias=} method.
60
+ * Added {URI::HTTP::QueryParams} module.
61
+ * Changed license from MIT to GPL-2.
62
+
63
+ ### 0.1.7 / 2008-04-28
64
+
65
+ * Added support for specifying Search modifiers.
66
+
67
+ Search.query(:filetype => :xls)
68
+
69
+ * Added the {GScraper::Search::Result#page} method.
70
+
71
+ ### 0.1.6 / 2008-03-15
72
+
73
+ * Renamed `GScraper.http_agent` to {GScraper.web_agent}.
74
+ * Added {GScraper.proxy} for global proxy configuration.
75
+ * Added the `WebAgent` module.
76
+ * Renamed `Search::Query#first_result` to `Search::Query#top_result`.
77
+ * Updated `Search::Query#page` logic for the new DOM layout being used.
78
+ * Added support for Sponsored Ad scraping.
79
+ * Added the methods `Query#sponsored_links` and
80
+ `Query#top_sponsored_link`.
81
+ * Added examples to README.txt.
82
+
83
+ ### 0.1.5 / 2007-12-29
84
+
85
+ * Fixed class inheritance in `gscraper/extensions/uri/http.rb`, found by
86
+ sanitybit.
87
+
88
+ ### 0.1.4 / 2007-12-23
89
+
90
+ * Added `Search::Query#result_at` for easier access of a single result at
91
+ a given index.
92
+ * Added scraping of the Cached and Similar Pages URLs of Search
93
+ Results.
94
+ * Added methods to `Search::Page` for accessing cached URLs, cached pages,
95
+ similar query URLs and similar Queries en masse.
96
+ * `Search::Query#page` and `Search::Query#first_page` now can receive blocks.
97
+ * Improved the formatting of URL query parameters.
98
+ * Added more unit-tests.
99
+ * Fixed scraping of Search Result summaries.
100
+ * Fixed various bugs in `Search::Query` uncovered during unit-testing.
101
+ * Fixed typos in the documentation for `Search::Page`.
102
+
103
+ ### 0.1.3 / 2007-12-22
104
+
105
+ * Added the `Search::Page` class, which contains many convenience methods
106
+ for searching through the results within a Page.
107
+
108
+ ### 0.1.2 / 2007-12-22
109
+
110
+ * Fixed a bug related to extracting the correct content-rights from search
111
+ query URLs.
112
+ * Added {GScraper.user_agent_aliases}.
113
+
114
+ ### 0.1.1 / 2007-12-21
115
+
116
+ * Forgot to include `lib/gscraper/version.rb`.
117
+
118
+ ### 0.1.0 / 2007-12-20
119
+
120
+ * Initial release.
121
+ * Supports the Google Search service.
122
+
data/Gemfile ADDED
@@ -0,0 +1,25 @@
1
+ source 'https://rubygems.org'
2
+
3
+ group(:runtime) do
4
+ gem 'json_pure', '~> 1.4.0'
5
+ gem 'mechanize', '~> 1.0.0'
6
+ end
7
+
8
+ group(:development) do
9
+ gem 'bundler', '~> 0.9.19'
10
+ gem 'rake', '~> 0.8.7'
11
+ gem 'jeweler', '~> 1.4.0', :git => 'git://github.com/technicalpickles/jeweler.git'
12
+ end
13
+
14
+ group(:doc) do
15
+ case RUBY_PLATFORM
16
+ when 'java'
17
+ gem 'maruku', '~> 0.6.0'
18
+ else
19
+ gem 'rdiscount', '~> 1.6.3'
20
+ end
21
+
22
+ gem 'yard', '~> 0.5.3'
23
+ end
24
+
25
+ gem 'rspec', '~> 1.3.0', :group => [:development, :test]
@@ -1,14 +1,14 @@
1
- = GScraper
1
+ # GScraper
2
2
 
3
- * http://gscraper.rubyforge.org/
4
- * http://github.com/postmodern/gscraper/
3
+ * [github.com/postmodern/gscraper](http://github.com/postmodern/gscraper/)
4
+ * [github.com/postmodern/gscraper/issues](http://github.com/postmodern/gscraper/issues)
5
5
  * Postmodern (postmodern.mod3 at gmail.com)
6
6
 
7
- == DESCRIPTION:
7
+ ## Description
8
8
 
9
9
  GScraper is a web-scraping interface to various Google Services.
10
10
 
11
- == FEATURES/PROBLEMS:
11
+ ## Features
12
12
 
13
13
  * Supports the Google Search service.
14
14
  * Provides access to search results and ranks.
@@ -16,21 +16,21 @@ GScraper is a web-scraping interface to various Google Services.
16
16
  * Provides HTTP access with custom User-Agent strings.
17
17
  * Provides proxy settings for HTTP access.
18
18
 
19
- == REQUIREMENTS:
19
+ ## Requirements
20
20
 
21
- * mechanize >= 0.9.0
21
+ * [mechanize](http://mechanize.rubyforge.org/mechanize/) ~> 1.0.0
22
22
 
23
- == INSTALL:
23
+ ## Install
24
24
 
25
- $ sudo gem install gscraper
25
+ $ sudo gem install gscraper
26
26
 
27
- == EXAMPLES:
27
+ ## Examples
28
28
 
29
- * Basic query:
29
+ Basic query:
30
30
 
31
31
  q = GScraper::Search.query(:query => 'ruby')
32
32
 
33
- * Advanced query:
33
+ Advanced query:
34
34
 
35
35
  q = GScraper::Search.query(:query => 'ruby') do |q|
36
36
  q.without_words = 'is'
@@ -38,7 +38,7 @@ GScraper is a web-scraping interface to various Google Services.
38
38
  q.numeric_range = 2..10
39
39
  end
40
40
 
41
- * Queries from URLs:
41
+ Queries from URLs:
42
42
 
43
43
  q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29')
44
44
 
@@ -47,7 +47,7 @@ GScraper is a web-scraping interface to various Google Services.
47
47
  q.occurrs_within # => :title
48
48
  q.rights # => :cc_by_nc
49
49
 
50
- * Getting the search results:
50
+ Getting the search results:
51
51
 
52
52
  q.first_page.select do |result|
53
53
  result.title =~ /Blog/
@@ -61,8 +61,8 @@ GScraper is a web-scraping interface to various Google Services.
61
61
 
62
62
  q.top_result # => Result
63
63
 
64
- * A Result object contains the rank, title, summary, cahced URL, similiar
65
- query URL and link URL of the search result.
64
+ A Result object contains the rank, title, summary, cached URL, similar
65
+ query URL and link URL of the search result.
66
66
 
67
67
  page = q.page(2)
68
68
 
@@ -73,7 +73,7 @@ GScraper is a web-scraping interface to various Google Services.
73
73
  page.cached_pages # => [...]
74
74
  page.similar_queries # => [...]
75
75
 
76
- * Iterating over the search results:
76
+ Iterating over the search results:
77
77
 
78
78
  q.each_on_page(2) do |result|
79
79
  puts result.title
@@ -83,7 +83,7 @@ GScraper is a web-scraping interface to various Google Services.
83
83
  puts result.url
84
84
  end
85
85
 
86
- * Iterating over the data within the search results:
86
+ Iterating over the data within the search results:
87
87
 
88
88
  page.each_title do |title|
89
89
  puts title
@@ -93,7 +93,7 @@ GScraper is a web-scraping interface to various Google Services.
93
93
  puts text
94
94
  end
95
95
 
96
- * Selecting search results:
96
+ Selecting search results:
97
97
 
98
98
  page.results_with do |result|
99
99
  ((result.rank > 2) && (result.rank < 10))
@@ -101,30 +101,30 @@ GScraper is a web-scraping interface to various Google Services.
101
101
 
102
102
  page.results_with_title(/Ruby/i) # => [...]
103
103
 
104
- * Selecting data within the search results:
104
+ Selecting data within the search results:
105
105
 
106
106
  page.titles # => [...]
107
107
 
108
108
  page.summaries # => [...]
109
109
 
110
- * Selecting the data of search results based on the search result:
110
+ Selecting the data of search results based on the search result:
111
111
 
112
112
  page.urls_of do |result|
113
113
  result.description.length > 10
114
114
  end
115
115
 
116
- * Selecting the Sponsored Links of a Query:
116
+ Selecting the Sponsored Links of a Query:
117
117
 
118
118
  q.sponsored_links # => [...]
119
119
 
120
120
  q.top_sponsored_link # => SponsoredAd
121
121
 
122
- * Setting the User-Agent globally:
122
+ Setting the User-Agent globally:
123
123
 
124
124
  GScraper.user_agent # => nil
125
125
  GScraper.user_agent = 'Awesome Browser v1.2'
126
126
 
127
- == LICENSE:
127
+ ## License
128
128
 
129
129
  GScraper - A web-scraping interface to various Google Services.
130
130
 
@@ -143,3 +143,4 @@ GNU General Public License for more details.
143
143
  You should have received a copy of the GNU General Public License
144
144
  along with this program; if not, write to the Free Software
145
145
  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
146
+
data/Rakefile CHANGED
@@ -1,16 +1,38 @@
1
- # -*- ruby -*-
2
-
3
1
  require 'rubygems'
4
- require 'hoe'
2
+ require 'bundler'
3
+
4
+ begin
5
+ Bundler.setup(:development, :doc)
6
+ rescue Bundler::BundlerError => e
7
+ STDERR.puts e.message
8
+ STDERR.puts "Run `bundle install` to install missing gems"
9
+ exit e.status_code
10
+ end
5
11
 
6
- require './tasks/spec.rb'
12
+ require 'rake'
13
+ require 'jeweler'
7
14
  require './lib/gscraper/version.rb'
8
15
 
9
- Hoe.new('gscraper', GScraper::VERSION) do |p|
10
- p.rubyforge_name = 'gscraper'
11
- p.developer('Postmodern', 'postmodern.mod3@gmail.com')
12
- p.remote_rdoc_dir = ''
13
- p.extra_deps = [['mechanize', '>=0.9.0']]
16
+ Jeweler::Tasks.new do |gem|
17
+ gem.name = 'gscraper'
18
+ gem.version = GScraper::VERSION
19
+ gem.license = 'GPL-2'
20
+ gem.summary = %Q{GScraper is a web-scraping interface to various Google Services.}
21
+ gem.description = %Q{GScraper is a web-scraping interface to various Google Services.}
22
+ gem.email = 'postmodern.mod3@gmail.com'
23
+ gem.homepage = 'http://github.com/postmodern/gscraper'
24
+ gem.authors = ['Postmodern']
25
+ gem.has_rdoc = 'yard'
14
26
  end
15
27
 
16
- # vim: syntax=Ruby
28
+ require 'spec/rake/spectask'
29
+ Spec::Rake::SpecTask.new(:spec) do |spec|
30
+ spec.libs += ['lib', 'spec']
31
+ spec.spec_files = FileList['spec/**/*_spec.rb']
32
+ spec.spec_opts = ['--options', '.specopts']
33
+ end
34
+
35
+ task :default => :spec
36
+
37
+ require 'yard'
38
+ YARD::Rake::YardocTask.new
@@ -0,0 +1,112 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{gscraper}
8
+ s.version = "0.3.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Postmodern"]
12
+ s.date = %q{2010-07-02}
13
+ s.description = %q{GScraper is a web-scraping interface to various Google Services.}
14
+ s.email = %q{postmodern.mod3@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "ChangeLog.md",
17
+ "README.md"
18
+ ]
19
+ s.files = [
20
+ ".gitignore",
21
+ ".specopts",
22
+ ".yardopts",
23
+ "COPYING.txt",
24
+ "ChangeLog.md",
25
+ "Gemfile",
26
+ "README.md",
27
+ "Rakefile",
28
+ "gscraper.gemspec",
29
+ "lib/gscraper.rb",
30
+ "lib/gscraper/extensions.rb",
31
+ "lib/gscraper/extensions/uri.rb",
32
+ "lib/gscraper/extensions/uri/http.rb",
33
+ "lib/gscraper/extensions/uri/query_params.rb",
34
+ "lib/gscraper/gscraper.rb",
35
+ "lib/gscraper/has_pages.rb",
36
+ "lib/gscraper/licenses.rb",
37
+ "lib/gscraper/page.rb",
38
+ "lib/gscraper/search.rb",
39
+ "lib/gscraper/search/ajax_query.rb",
40
+ "lib/gscraper/search/page.rb",
41
+ "lib/gscraper/search/query.rb",
42
+ "lib/gscraper/search/result.rb",
43
+ "lib/gscraper/search/search.rb",
44
+ "lib/gscraper/search/web_query.rb",
45
+ "lib/gscraper/sponsored_ad.rb",
46
+ "lib/gscraper/sponsored_links.rb",
47
+ "lib/gscraper/version.rb",
48
+ "spec/extensions/uri/http_spec.rb",
49
+ "spec/extensions/uri/query_params_spec.rb",
50
+ "spec/gscraper_spec.rb",
51
+ "spec/has_pages_examples.rb",
52
+ "spec/has_sponsored_links_examples.rb",
53
+ "spec/helpers/query.rb",
54
+ "spec/helpers/uri.rb",
55
+ "spec/page_has_results_examples.rb",
56
+ "spec/search/ajax_query_spec.rb",
57
+ "spec/search/page_has_results_examples.rb",
58
+ "spec/search/query_spec.rb",
59
+ "spec/search/web_query_spec.rb",
60
+ "spec/spec_helper.rb"
61
+ ]
62
+ s.has_rdoc = %q{yard}
63
+ s.homepage = %q{http://github.com/postmodern/gscraper}
64
+ s.licenses = ["GPL-2"]
65
+ s.require_paths = ["lib"]
66
+ s.rubygems_version = %q{1.3.7}
67
+ s.summary = %q{GScraper is a web-scraping interface to various Google Services.}
68
+ s.test_files = [
69
+ "spec/extensions/uri/http_spec.rb",
70
+ "spec/extensions/uri/query_params_spec.rb",
71
+ "spec/gscraper_spec.rb",
72
+ "spec/has_pages_examples.rb",
73
+ "spec/has_sponsored_links_examples.rb",
74
+ "spec/helpers/query.rb",
75
+ "spec/helpers/uri.rb",
76
+ "spec/page_has_results_examples.rb",
77
+ "spec/search/ajax_query_spec.rb",
78
+ "spec/search/page_has_results_examples.rb",
79
+ "spec/search/query_spec.rb",
80
+ "spec/search/web_query_spec.rb",
81
+ "spec/spec_helper.rb"
82
+ ]
83
+
84
+ if s.respond_to? :specification_version then
85
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
86
+ s.specification_version = 3
87
+
88
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
89
+ s.add_runtime_dependency(%q<json_pure>, ["~> 1.4.0"])
90
+ s.add_runtime_dependency(%q<mechanize>, ["~> 1.0.0"])
91
+ s.add_development_dependency(%q<bundler>, ["~> 0.9.19"])
92
+ s.add_development_dependency(%q<rake>, ["~> 0.8.7"])
93
+ s.add_development_dependency(%q<jeweler>, ["~> 1.4.0"])
94
+ s.add_development_dependency(%q<rspec>, ["~> 1.3.0"])
95
+ else
96
+ s.add_dependency(%q<json_pure>, ["~> 1.4.0"])
97
+ s.add_dependency(%q<mechanize>, ["~> 1.0.0"])
98
+ s.add_dependency(%q<bundler>, ["~> 0.9.19"])
99
+ s.add_dependency(%q<rake>, ["~> 0.8.7"])
100
+ s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
101
+ s.add_dependency(%q<rspec>, ["~> 1.3.0"])
102
+ end
103
+ else
104
+ s.add_dependency(%q<json_pure>, ["~> 1.4.0"])
105
+ s.add_dependency(%q<mechanize>, ["~> 1.0.0"])
106
+ s.add_dependency(%q<bundler>, ["~> 0.9.19"])
107
+ s.add_dependency(%q<rake>, ["~> 0.8.7"])
108
+ s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
109
+ s.add_dependency(%q<rspec>, ["~> 1.3.0"])
110
+ end
111
+ end
112
+