gscraper 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. data/.gitignore +8 -0
  2. data/.specopts +1 -0
  3. data/.yardopts +1 -0
  4. data/ChangeLog.md +122 -0
  5. data/Gemfile +25 -0
  6. data/{README.txt → README.md} +25 -24
  7. data/Rakefile +32 -10
  8. data/gscraper.gemspec +112 -0
  9. data/lib/gscraper.rb +0 -2
  10. data/lib/gscraper/extensions.rb +0 -2
  11. data/lib/gscraper/extensions/uri.rb +0 -2
  12. data/lib/gscraper/extensions/uri/http.rb +0 -2
  13. data/lib/gscraper/extensions/uri/query_params.rb +18 -5
  14. data/lib/gscraper/gscraper.rb +61 -70
  15. data/lib/gscraper/has_pages.rb +76 -20
  16. data/lib/gscraper/licenses.rb +0 -2
  17. data/lib/gscraper/page.rb +45 -16
  18. data/lib/gscraper/search.rb +0 -2
  19. data/lib/gscraper/search/ajax_query.rb +75 -22
  20. data/lib/gscraper/search/page.rb +328 -122
  21. data/lib/gscraper/search/query.rb +100 -7
  22. data/lib/gscraper/search/result.rb +27 -6
  23. data/lib/gscraper/search/search.rb +59 -9
  24. data/lib/gscraper/search/web_query.rb +120 -37
  25. data/lib/gscraper/sponsored_ad.rb +19 -6
  26. data/lib/gscraper/sponsored_links.rb +260 -92
  27. data/lib/gscraper/version.rb +2 -3
  28. data/spec/extensions/uri/query_params_spec.rb +8 -0
  29. data/spec/gscraper_spec.rb +9 -4
  30. data/spec/has_pages_examples.rb +0 -2
  31. data/spec/has_sponsored_links_examples.rb +2 -1
  32. data/spec/helpers/query.rb +3 -1
  33. data/spec/helpers/uri.rb +6 -4
  34. data/spec/page_has_results_examples.rb +0 -2
  35. data/spec/search/ajax_query_spec.rb +6 -11
  36. data/spec/search/page_has_results_examples.rb +0 -2
  37. data/spec/search/web_query_spec.rb +6 -11
  38. data/spec/spec_helper.rb +10 -4
  39. metadata +147 -54
  40. data/History.txt +0 -101
  41. data/Manifest.txt +0 -38
  42. data/tasks/spec.rb +0 -9
@@ -0,0 +1,8 @@
1
+ doc
2
+ pkg
3
+ tmp/*
4
+ .DS_Store
5
+ .bundle
6
+ .yardoc
7
+ *.swp
8
+ *~
@@ -0,0 +1 @@
1
+ --colour --format specdoc
@@ -0,0 +1 @@
1
+ --markup markdown --title 'GScraper Documentation' --protected --files ChangeLog.md,COPYING.txt
@@ -0,0 +1,122 @@
1
+ ### 0.3.0 / 2010-07-01
2
+
3
+ * Upgraded to mechanize ~> 1.0.0.
4
+ * Upgraded from json to json_pure ~> 1.4.0.
5
+ * Switched from Hoe to Jeweler for building RubyGems.
6
+ * Switched to Markdown documentation syntax with full YARD tags.
7
+ * Added support for `:allinanchor` and `:inanchor` options to
8
+ {GScraper::Search::Query}.
9
+ * Added support for the `:define` option in {GScraper::Search::Query}.
10
+ * Aliased {GScraper::Search::WebQuery#similar_to} to `related`.
11
+ * Aliased {GScraper::Search::WebQuery#similar_to=} to `related=`.
12
+ * Aliased {GScraper::Search::WebQuery#links_to} to `link`.
13
+ * Aliased {GScraper::Search::WebQuery#links_to=} to `link=`.
14
+ * Removed `GScraper.open_uri`.
15
+ * Removed `GScraper.open_page`.
16
+ * Fixed the escaping/unescaping of URL query params in {URI::QueryParams}.
17
+ * Use `yield` instead of `block.call`, when possible.
18
+ * All enumerable methods now return an `Enumerator` object, if no block was
19
+ given.
20
+
21
+ ### 0.2.4 / 2009-03-18
22
+
23
+ * Added {GScraper::SponsoredAd#direct_link}.
24
+ * Fixed a bug in {GScraper::SponsoredAd#direct_url} where direct links
25
+ were not being URI escaped.
26
+ * Removed last references to Hpricot, replacing them with Nokogiri.
27
+
28
+ ### 0.2.3 / 2009-01-27
29
+
30
+ * Fixed a bug in {GScraper::Search::WebQuery#page}, when the search query
31
+ returned fewer results than the expected results-per-page.
32
+
33
+ ### 0.2.2 / 2009-01-14
34
+
35
+ * Updated {GScraper::Search::WebQuery} to use Nokogiri properly.
36
+
37
+ ### 0.2.1 / 2008-08-27
38
+
39
+ * Updated XPath queries in {GScraper::Search::WebQuery} for new Google (tm)
40
+ Search Result HTML schema.
41
+
42
+ ### 0.2.0 / 2008-05-10
43
+
44
+ * Removed `GScraper::WebAgent`.
45
+ * Added {GScraper::Page} and {GScraper::HasPages}.
46
+ * {GScraper::Search::Result#page} and {GScraper::Search::Result#cached_page}
47
+ no longer receive blocks.
48
+ * Added `GScraper::Search::Query` which supports building query expressions.
49
+ * {GScraper::SponsoredLinks#initialize} and {GScraper::Page#initialize}
50
+ now take blocks.
51
+ * Renamed `GScraper::Search::Query` to {GScraper::Search::WebQuery}.
52
+ * {GScraper::Search::WebQuery#page} and
53
+ {GScraper::Search::WebQuery#sponsored_links} no longer take blocks.
54
+ * Added {GScraper::Search::AJAXQuery}.
55
+ * Replaced Unit Tests with Rspec specifications.
56
+
57
+ ### 0.1.8 / 2008-04-30
58
+
59
+ * Added the {GScraper.user_agent_alias=} method.
60
+ * Added {URI::HTTP::QueryParams} module.
61
+ * Changed license from MIT to GPL-2.
62
+
63
+ ### 0.1.7 / 2008-04-28
64
+
65
+ * Added support for specifying Search modifiers.
66
+
67
+ Search.query(:filetype => :xls)
68
+
69
+ * Added the {GScraper::Search::Result#page} method.
70
+
71
+ ### 0.1.6 / 2008-03-15
72
+
73
+ * Renamed `GScraper.http_agent` to {GScraper.web_agent}.
74
+ * Added {GScraper.proxy} for global proxy configuration.
75
+ * Added the `WebAgent` module.
76
+ * Renamed `Search::Query#first_result` to `Search::Query#top_result`.
77
+ * Updated `Search::Query#page` logic for the new DOM layout being used.
78
+ * Added support for Sponsored Ad scraping.
79
+ * Added the methods `Query#sponsored_links` and
80
+ `Query#top_sponsored_link`.
81
+ * Added examples to README.txt.
82
+
83
+ ### 0.1.5 / 2007-12-29
84
+
85
+ * Fixed class inheritance in `gscraper/extensions/uri/http.rb`, found by
86
+ sanitybit.
87
+
88
+ ### 0.1.4 / 2007-12-23
89
+
90
+ * Added `Search::Query#result_at` for easier access of a single result at
91
+ a given index.
92
+ * Added scraping of the Cached and Similar Pages URLs of Search
93
+ Results.
94
+ * Added methods to `Search::Page` for accessing cached URLs, cached pages,
95
+ similar query URLs and similar Queries in mass.
96
+ * `Search::Query#page` and `Search::Query#first_page` now can receive blocks.
97
+ * Improved the formatting of URL query parameters.
98
+ * Added more unit-tests.
99
+ * Fixed scraping of Search Result summaries.
100
+ * Fixed various bugs in `Search::Query` uncovered during unit-testing.
101
+ * Fixed typos in the documentation for `Search::Page`.
102
+
103
+ ### 0.1.3 / 2007-12-22
104
+
105
+ * Added the `Search::Page` class, which contains many convenience methods
106
+ for searching through the results within a Page.
107
+
108
+ ### 0.1.2 / 2007-12-22
109
+
110
+ * Fixed a bug related to extracting the correct content-rights from search
111
+ query URLs.
112
+ * Added {GScraper.user_agent_aliases}.
113
+
114
+ ### 0.1.1 / 2007-12-21
115
+
116
+ * Forgot to include `lib/gscraper/version.rb`.
117
+
118
+ ### 0.1.0 / 2007-12-20
119
+
120
+ * Initial release.
121
+ * Supports the Google Search service.
122
+
data/Gemfile ADDED
@@ -0,0 +1,25 @@
1
+ source 'https://rubygems.org'
2
+
3
+ group(:runtime) do
4
+ gem 'json_pure', '~> 1.4.0'
5
+ gem 'mechanize', '~> 1.0.0'
6
+ end
7
+
8
+ group(:development) do
9
+ gem 'bundler', '~> 0.9.19'
10
+ gem 'rake', '~> 0.8.7'
11
+ gem 'jeweler', '~> 1.4.0', :git => 'git://github.com/technicalpickles/jeweler.git'
12
+ end
13
+
14
+ group(:doc) do
15
+ case RUBY_PLATFORM
16
+ when 'java'
17
+ gem 'maruku', '~> 0.6.0'
18
+ else
19
+ gem 'rdiscount', '~> 1.6.3'
20
+ end
21
+
22
+ gem 'yard', '~> 0.5.3'
23
+ end
24
+
25
+ gem 'rspec', '~> 1.3.0', :group => [:development, :test]
@@ -1,14 +1,14 @@
1
- = GScraper
1
+ # GScraper
2
2
 
3
- * http://gscraper.rubyforge.org/
4
- * http://github.com/postmodern/gscraper/
3
+ * [github.com/postmodern/gscraper](http://github.com/postmodern/gscraper/)
4
+ * [github.com/postmodern/gscraper/issues](http://github.com/postmodern/gscraper/issues)
5
5
  * Postmodern (postmodern.mod3 at gmail.com)
6
6
 
7
- == DESCRIPTION:
7
+ ## Description
8
8
 
9
9
  GScraper is a web-scraping interface to various Google Services.
10
10
 
11
- == FEATURES/PROBLEMS:
11
+ ## Features
12
12
 
13
13
  * Supports the Google Search service.
14
14
  * Provides access to search results and ranks.
@@ -16,21 +16,21 @@ GScraper is a web-scraping interface to various Google Services.
16
16
  * Provides HTTP access with custom User-Agent strings.
17
17
  * Provides proxy settings for HTTP access.
18
18
 
19
- == REQUIREMENTS:
19
+ ## Requirements
20
20
 
21
- * mechanize >= 0.9.0
21
+ * [mechanize](http://mechanize.rubyforge.org/mechanize/) ~> 1.0.0
22
22
 
23
- == INSTALL:
23
+ ## Install
24
24
 
25
- $ sudo gem install gscraper
25
+ $ sudo gem install gscraper
26
26
 
27
- == EXAMPLES:
27
+ ## Examples
28
28
 
29
- * Basic query:
29
+ Basic query:
30
30
 
31
31
  q = GScraper::Search.query(:query => 'ruby')
32
32
 
33
- * Advanced query:
33
+ Advanced query:
34
34
 
35
35
  q = GScraper::Search.query(:query => 'ruby') do |q|
36
36
  q.without_words = 'is'
@@ -38,7 +38,7 @@ GScraper is a web-scraping interface to various Google Services.
38
38
  q.numeric_range = 2..10
39
39
  end
40
40
 
41
- * Queries from URLs:
41
+ Queries from URLs:
42
42
 
43
43
  q = GScraper::Search.query_from_url('http://www.google.com/search?as_q=ruby&as_epq=&as_oq=rails&as_ft=i&as_qdr=all&as_occt=body&as_rights=%28cc_publicdomain%7Ccc_attribute%7Ccc_sharealike%7Ccc_noncommercial%29.-%28cc_nonderived%29')
44
44
 
@@ -47,7 +47,7 @@ GScraper is a web-scraping interface to various Google Services.
47
47
  q.occurrs_within # => :title
48
48
  q.rights # => :cc_by_nc
49
49
 
50
- * Getting the search results:
50
+ Getting the search results:
51
51
 
52
52
  q.first_page.select do |result|
53
53
  result.title =~ /Blog/
@@ -61,8 +61,8 @@ GScraper is a web-scraping interface to various Google Services.
61
61
 
62
62
  q.top_result # => Result
63
63
 
64
- * A Result object contains the rank, title, summary, cahced URL, similiar
65
- query URL and link URL of the search result.
64
+ A Result object contains the rank, title, summary, cached URL, similar
65
+ query URL and link URL of the search result.
66
66
 
67
67
  page = q.page(2)
68
68
 
@@ -73,7 +73,7 @@ GScraper is a web-scraping interface to various Google Services.
73
73
  page.cached_pages # => [...]
74
74
  page.similar_queries # => [...]
75
75
 
76
- * Iterating over the search results:
76
+ Iterating over the search results:
77
77
 
78
78
  q.each_on_page(2) do |result|
79
79
  puts result.title
@@ -83,7 +83,7 @@ GScraper is a web-scraping interface to various Google Services.
83
83
  puts result.url
84
84
  end
85
85
 
86
- * Iterating over the data within the search results:
86
+ Iterating over the data within the search results:
87
87
 
88
88
  page.each_title do |title|
89
89
  puts title
@@ -93,7 +93,7 @@ GScraper is a web-scraping interface to various Google Services.
93
93
  puts text
94
94
  end
95
95
 
96
- * Selecting search results:
96
+ Selecting search results:
97
97
 
98
98
  page.results_with do |result|
99
99
  ((result.rank > 2) && (result.rank < 10))
@@ -101,30 +101,30 @@ GScraper is a web-scraping interface to various Google Services.
101
101
 
102
102
  page.results_with_title(/Ruby/i) # => [...]
103
103
 
104
- * Selecting data within the search results:
104
+ Selecting data within the search results:
105
105
 
106
106
  page.titles # => [...]
107
107
 
108
108
  page.summaries # => [...]
109
109
 
110
- * Selecting the data of search results based on the search result:
110
+ Selecting the data of search results based on the search result:
111
111
 
112
112
  page.urls_of do |result|
113
113
  result.description.length > 10
114
114
  end
115
115
 
116
- * Selecting the Sponsored Links of a Query:
116
+ Selecting the Sponsored Links of a Query:
117
117
 
118
118
  q.sponsored_links # => [...]
119
119
 
120
120
  q.top_sponsored_link # => SponsoredAd
121
121
 
122
- * Setting the User-Agent globally:
122
+ Setting the User-Agent globally:
123
123
 
124
124
  GScraper.user_agent # => nil
125
125
  GScraper.user_agent = 'Awesome Browser v1.2'
126
126
 
127
- == LICENSE:
127
+ ## License
128
128
 
129
129
  GScraper - A web-scraping interface to various Google Services.
130
130
 
@@ -143,3 +143,4 @@ GNU General Public License for more details.
143
143
  You should have received a copy of the GNU General Public License
144
144
  along with this program; if not, write to the Free Software
145
145
  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
146
+
data/Rakefile CHANGED
@@ -1,16 +1,38 @@
1
- # -*- ruby -*-
2
-
3
1
  require 'rubygems'
4
- require 'hoe'
2
+ require 'bundler'
3
+
4
+ begin
5
+ Bundler.setup(:development, :doc)
6
+ rescue Bundler::BundlerError => e
7
+ STDERR.puts e.message
8
+ STDERR.puts "Run `bundle install` to install missing gems"
9
+ exit e.status_code
10
+ end
5
11
 
6
- require './tasks/spec.rb'
12
+ require 'rake'
13
+ require 'jeweler'
7
14
  require './lib/gscraper/version.rb'
8
15
 
9
- Hoe.new('gscraper', GScraper::VERSION) do |p|
10
- p.rubyforge_name = 'gscraper'
11
- p.developer('Postmodern', 'postmodern.mod3@gmail.com')
12
- p.remote_rdoc_dir = ''
13
- p.extra_deps = [['mechanize', '>=0.9.0']]
16
+ Jeweler::Tasks.new do |gem|
17
+ gem.name = 'gscraper'
18
+ gem.version = GScraper::VERSION
19
+ gem.license = 'GPL-2'
20
+ gem.summary = %Q{GScraper is a web-scraping interface to various Google Services.}
21
+ gem.description = %Q{GScraper is a web-scraping interface to various Google Services.}
22
+ gem.email = 'postmodern.mod3@gmail.com'
23
+ gem.homepage = 'http://github.com/postmodern/gscraper'
24
+ gem.authors = ['Postmodern']
25
+ gem.has_rdoc = 'yard'
14
26
  end
15
27
 
16
- # vim: syntax=Ruby
28
+ require 'spec/rake/spectask'
29
+ Spec::Rake::SpecTask.new(:spec) do |spec|
30
+ spec.libs += ['lib', 'spec']
31
+ spec.spec_files = FileList['spec/**/*_spec.rb']
32
+ spec.spec_opts = ['--options', '.specopts']
33
+ end
34
+
35
+ task :default => :spec
36
+
37
+ require 'yard'
38
+ YARD::Rake::YardocTask.new
@@ -0,0 +1,112 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{gscraper}
8
+ s.version = "0.3.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Postmodern"]
12
+ s.date = %q{2010-07-02}
13
+ s.description = %q{GScraper is a web-scraping interface to various Google Services.}
14
+ s.email = %q{postmodern.mod3@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "ChangeLog.md",
17
+ "README.md"
18
+ ]
19
+ s.files = [
20
+ ".gitignore",
21
+ ".specopts",
22
+ ".yardopts",
23
+ "COPYING.txt",
24
+ "ChangeLog.md",
25
+ "Gemfile",
26
+ "README.md",
27
+ "Rakefile",
28
+ "gscraper.gemspec",
29
+ "lib/gscraper.rb",
30
+ "lib/gscraper/extensions.rb",
31
+ "lib/gscraper/extensions/uri.rb",
32
+ "lib/gscraper/extensions/uri/http.rb",
33
+ "lib/gscraper/extensions/uri/query_params.rb",
34
+ "lib/gscraper/gscraper.rb",
35
+ "lib/gscraper/has_pages.rb",
36
+ "lib/gscraper/licenses.rb",
37
+ "lib/gscraper/page.rb",
38
+ "lib/gscraper/search.rb",
39
+ "lib/gscraper/search/ajax_query.rb",
40
+ "lib/gscraper/search/page.rb",
41
+ "lib/gscraper/search/query.rb",
42
+ "lib/gscraper/search/result.rb",
43
+ "lib/gscraper/search/search.rb",
44
+ "lib/gscraper/search/web_query.rb",
45
+ "lib/gscraper/sponsored_ad.rb",
46
+ "lib/gscraper/sponsored_links.rb",
47
+ "lib/gscraper/version.rb",
48
+ "spec/extensions/uri/http_spec.rb",
49
+ "spec/extensions/uri/query_params_spec.rb",
50
+ "spec/gscraper_spec.rb",
51
+ "spec/has_pages_examples.rb",
52
+ "spec/has_sponsored_links_examples.rb",
53
+ "spec/helpers/query.rb",
54
+ "spec/helpers/uri.rb",
55
+ "spec/page_has_results_examples.rb",
56
+ "spec/search/ajax_query_spec.rb",
57
+ "spec/search/page_has_results_examples.rb",
58
+ "spec/search/query_spec.rb",
59
+ "spec/search/web_query_spec.rb",
60
+ "spec/spec_helper.rb"
61
+ ]
62
+ s.has_rdoc = %q{yard}
63
+ s.homepage = %q{http://github.com/postmodern/gscraper}
64
+ s.licenses = ["GPL-2"]
65
+ s.require_paths = ["lib"]
66
+ s.rubygems_version = %q{1.3.7}
67
+ s.summary = %q{GScraper is a web-scraping interface to various Google Services.}
68
+ s.test_files = [
69
+ "spec/extensions/uri/http_spec.rb",
70
+ "spec/extensions/uri/query_params_spec.rb",
71
+ "spec/gscraper_spec.rb",
72
+ "spec/has_pages_examples.rb",
73
+ "spec/has_sponsored_links_examples.rb",
74
+ "spec/helpers/query.rb",
75
+ "spec/helpers/uri.rb",
76
+ "spec/page_has_results_examples.rb",
77
+ "spec/search/ajax_query_spec.rb",
78
+ "spec/search/page_has_results_examples.rb",
79
+ "spec/search/query_spec.rb",
80
+ "spec/search/web_query_spec.rb",
81
+ "spec/spec_helper.rb"
82
+ ]
83
+
84
+ if s.respond_to? :specification_version then
85
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
86
+ s.specification_version = 3
87
+
88
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
89
+ s.add_runtime_dependency(%q<json_pure>, ["~> 1.4.0"])
90
+ s.add_runtime_dependency(%q<mechanize>, ["~> 1.0.0"])
91
+ s.add_development_dependency(%q<bundler>, ["~> 0.9.19"])
92
+ s.add_development_dependency(%q<rake>, ["~> 0.8.7"])
93
+ s.add_development_dependency(%q<jeweler>, ["~> 1.4.0"])
94
+ s.add_development_dependency(%q<rspec>, ["~> 1.3.0"])
95
+ else
96
+ s.add_dependency(%q<json_pure>, ["~> 1.4.0"])
97
+ s.add_dependency(%q<mechanize>, ["~> 1.0.0"])
98
+ s.add_dependency(%q<bundler>, ["~> 0.9.19"])
99
+ s.add_dependency(%q<rake>, ["~> 0.8.7"])
100
+ s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
101
+ s.add_dependency(%q<rspec>, ["~> 1.3.0"])
102
+ end
103
+ else
104
+ s.add_dependency(%q<json_pure>, ["~> 1.4.0"])
105
+ s.add_dependency(%q<mechanize>, ["~> 1.0.0"])
106
+ s.add_dependency(%q<bundler>, ["~> 0.9.19"])
107
+ s.add_dependency(%q<rake>, ["~> 0.8.7"])
108
+ s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
109
+ s.add_dependency(%q<rspec>, ["~> 1.3.0"])
110
+ end
111
+ end
112
+