gsearch-parser 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +30 -2
- data/VERSION +1 -1
- data/gsearch-parser.gemspec +2 -2
- data/lib/gsearch-parser.rb +68 -33
- metadata +2 -2
data/README.md
CHANGED
@@ -1,6 +1,7 @@
|
|
1
|
-
#
|
1
|
+
# GSearchParser
|
2
2
|
|
3
3
|
TODO: Write a gem description
|
4
|
+
GSearchParser is a lightweight framework for making Google search queries and parsing the resulting pages. More parsed results can be requested by simply calling the 'nextResults' method.
|
4
5
|
|
5
6
|
## Installation
|
6
7
|
|
@@ -18,7 +19,34 @@ Or install it yourself as:
|
|
18
19
|
|
19
20
|
## Usage
|
20
21
|
|
21
|
-
|
22
|
+
require 'gsearch-parser'
|
23
|
+
|
24
|
+
# Create a new Google web search from a query string
|
25
|
+
webSearch = GSearchParser.webSearch('what')
|
26
|
+
|
27
|
+
# Iterate over results
|
28
|
+
webSearch.each do |result|
|
29
|
+
puts "\t" + result.title
|
30
|
+
puts "\t" + result.content
|
31
|
+
puts "\t" + result.uri
|
32
|
+
puts "\n"
|
33
|
+
end
|
34
|
+
|
35
|
+
# Fetch the next set of results, and iterate over them
|
36
|
+
webSearch.nextResults.each do |result|
|
37
|
+
puts "\t" + result.title
|
38
|
+
puts "\t" + result.content
|
39
|
+
puts "\t" + result.uri
|
40
|
+
puts "\n"
|
41
|
+
end
|
42
|
+
|
43
|
+
# Iterate over all the results, including the ones from calls to .nextResults
|
44
|
+
webSearch.each do |result|
|
45
|
+
puts "\t" + result.title
|
46
|
+
puts "\t" + result.content
|
47
|
+
puts "\t" + result.uri
|
48
|
+
puts "\n"
|
49
|
+
end
|
22
50
|
|
23
51
|
## Contributing
|
24
52
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.4
|
1
|
+
0.3.0
|
data/gsearch-parser.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "gsearch-parser"
|
8
|
-
s.version = "0.2.4"
|
8
|
+
s.version = "0.3.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Diego Netto"]
|
12
|
-
s.date = "2012-04-
|
12
|
+
s.date = "2012-04-15"
|
13
13
|
s.description = "Queries Google search and parses the resulting web page for content."
|
14
14
|
s.email = "diegormnetto@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/gsearch-parser.rb
CHANGED
@@ -1,38 +1,61 @@
|
|
1
|
-
require "gsearch-parser/version"
|
2
1
|
require 'open-uri'
|
3
2
|
require 'nokogiri'
|
4
3
|
|
4
|
+
#
|
5
|
+
# Module method definitions
|
6
|
+
#
|
5
7
|
module GSearchParser
|
6
8
|
|
9
|
+
# Entry method for performing a web search
|
7
10
|
def GSearchParser.webSearch(query)
|
8
|
-
GoogleWebSearch.new(query)
|
11
|
+
webSearch = GoogleWebSearch.new(query)
|
9
12
|
end
|
10
13
|
|
11
14
|
end
|
12
15
|
|
13
|
-
|
14
|
-
#
|
15
|
-
#
|
16
|
-
# #
|
17
|
-
###################################################
|
16
|
+
#
|
17
|
+
# Google Web Search class
|
18
|
+
#
|
18
19
|
class GoogleWebSearch
|
19
|
-
attr_accessor :results
|
20
|
-
|
20
|
+
attr_accessor :results, :currentPage
|
21
|
+
@index
|
22
|
+
|
21
23
|
# Class initializer
|
22
24
|
def initialize(query)
|
23
|
-
# Initialize
|
25
|
+
# Initialize variables
|
24
26
|
@results = Array.new
|
27
|
+
@index = 0
|
28
|
+
|
29
|
+
# Update the results list: (Fetch, Store, and Parse)
|
30
|
+
updateResults("http://google.com/search?sourceid=chrome&q=#{query}")
|
31
|
+
end
|
32
|
+
|
33
|
+
# Update the WebSearch results array by performing a Fetch, Store, Parse routine
|
34
|
+
def updateResults(url)
|
35
|
+
# Fetch
|
36
|
+
searchPage = fetchPage(url)
|
25
37
|
|
26
|
-
#
|
38
|
+
# Store
|
39
|
+
@currentPage = searchPage
|
40
|
+
|
41
|
+
# Parse
|
42
|
+
parseCurrentPage
|
43
|
+
end
|
44
|
+
|
45
|
+
# Fetch the page from a URL
|
46
|
+
def fetchPage(url)
|
47
|
+
Nokogiri::HTML(open(url, 'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.152 Safari/535.19'))
|
48
|
+
end
|
27
49
|
|
28
|
-
|
29
|
-
|
30
|
-
|
50
|
+
# Parse the current page and populate results
|
51
|
+
def parseCurrentPage
|
52
|
+
# Initialize local variables
|
53
|
+
currentResults = Array.new
|
31
54
|
|
32
55
|
# Iterate over each Google result list element
|
33
|
-
|
56
|
+
@currentPage.css('li.g').each do |result|
|
34
57
|
# Extract the title
|
35
|
-
title = result.css('h3
|
58
|
+
title = result.css('h3.r a').first.inner_html
|
36
59
|
|
37
60
|
# Extract the content. There is the possibility for
|
38
61
|
# the content to be nil, so check for this
|
@@ -47,8 +70,22 @@ class GoogleWebSearch
|
|
47
70
|
end
|
48
71
|
|
49
72
|
# Create a new Result object and append to the array
|
50
|
-
|
73
|
+
currentResults << Result.new(title, content, uri)
|
51
74
|
end
|
75
|
+
@results += currentResults
|
76
|
+
return currentResults
|
77
|
+
end
|
78
|
+
|
79
|
+
# Parse the results from the next page and append to results list
|
80
|
+
def nextResults
|
81
|
+
# Parse next result page link
|
82
|
+
nextPageUrl = @currentPage.css("table#nav tr td a")[@index]['href']
|
83
|
+
|
84
|
+
# Increment reference index
|
85
|
+
@index += 1
|
86
|
+
|
87
|
+
# Update results
|
88
|
+
updateResults("http://www.google.com" + nextPageUrl)
|
52
89
|
end
|
53
90
|
|
54
91
|
# Iterator over results
|
@@ -56,22 +93,20 @@ class GoogleWebSearch
|
|
56
93
|
@results.each(&blk)
|
57
94
|
end
|
58
95
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
# Class initializer
|
68
|
-
def initialize(title, content, uri)
|
69
|
-
@title = title
|
70
|
-
@content = content
|
71
|
-
@uri = uri
|
72
|
-
end
|
96
|
+
end # GoogleWebSearch
|
97
|
+
|
98
|
+
#
|
99
|
+
# Result class
|
100
|
+
#
|
101
|
+
class Result
|
102
|
+
attr_accessor :title, :content, :uri
|
73
103
|
|
74
|
-
|
104
|
+
# Class initializer
|
105
|
+
def initialize(title, content, uri)
|
106
|
+
@title = title
|
107
|
+
@content = content
|
108
|
+
@uri = uri
|
109
|
+
end
|
75
110
|
|
76
|
-
end #
|
111
|
+
end # Result
|
77
112
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gsearch-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-15 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|