google_parser 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +35 -0
- data/lib/google_parser/version.rb +5 -0
- data/lib/google_parser.rb +71 -0
- metadata +63 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 76ced4f6c307e04166d79f3cdff8eef98cbf76cf3668d8a1945edd0b9185c1c9
|
4
|
+
data.tar.gz: 231973730cea75a1be4698cbce1e0f537f4c56f698892e522c96cefd9536d0bd
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: adec3d65b94f63cdd17bf3197d84a31361a215549c0774dfee58aa8b053bab3a760d2d73defee81ada1273c56460b4957aae7589eeb1610505d7cc39decfcd49
|
7
|
+
data.tar.gz: a65ecb950d44c8bfc1510a7bc0aa8e3838709b038a7248c85d323bef5747e97a9e21d7af3e987a019b0fdb491040c8fbeb6c2f593f120309d5a937e5ce097973
|
data/README.md
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# GoogleParser
|
2
|
+
|
3
|
+
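GoogleParser extracts the organic results (position, title, description, breadcrumbs, URL, domain and root domain) from the raw HTML of a Google search results page, using Nokogiri.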
|
4
|
+
|
5
|
+
Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/google_parser`. To experiment with that code, run `bin/console` for an interactive prompt.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
TODO: Replace `UPDATE_WITH_YOUR_GEM_NAME_PRIOR_TO_RELEASE_TO_RUBYGEMS_ORG` with your gem name right after releasing it to RubyGems.org. Please do not do it earlier due to security reasons. Alternatively, replace this section with instructions to install your gem from git if you don't plan to release to RubyGems.org.
|
10
|
+
|
11
|
+
Install the gem and add to the application's Gemfile by executing:
|
12
|
+
|
13
|
+
$ bundle add google_parser
|
14
|
+
|
15
|
+
If bundler is not being used to manage dependencies, install the gem by executing:
|
16
|
+
|
17
|
+
$ gem install google_parser
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
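Pass the raw HTML of a Google results page to the parser, then read the extracted results:

```ruby
parser = GoogleParser::GoogleParser.new(raw_html)
parser.organic_results.each { |result| puts "#{result[:position]}. #{result[:title]} (#{result[:url]})" }
```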
|
22
|
+
|
23
|
+
## Development
|
24
|
+
|
25
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
26
|
+
|
27
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
28
|
+
|
29
|
+
## Contributing
|
30
|
+
|
31
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/kjellberg/google_parser.
|
32
|
+
|
33
|
+
## License
|
34
|
+
|
35
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "google_parser/version"

require "nokogiri"
require "uri"
|
6
|
+
|
7
|
+
module GoogleParser
  # Raised when the page's <body jsmodel="..."> value has no known selector set.
  class InvalidJsModelError < StandardError; end

  # Parses the raw HTML of a Google results page into a list of organic results.
  #
  # Parsing happens eagerly in the constructor; afterwards the results are
  # available through #organic_results as an array of hashes with the keys
  # :position, :title, :description, :breadcrumbs, :url, :domain and
  # :root_domain.
  class GoogleParser
    # CSS selectors per page layout, keyed by the `jsmodel` attribute of <body>.
    SELECTORS = {
      "hspDDf" => {
        organic_results: {
          container: "div.Gx5Zad.fP1Qef.xpd.EtOod.pkphOe",
          title: "h3 > div",
          description: "div.BNeawe.s3v9rd.AP7Wnd",
          breadcrumbs: "div.BNeawe.UPmit.AP7Wnd.lRVwie",
          url: "a[href]"
        }.freeze
      }.freeze
    }.freeze

    # Captures the (still percent-encoded) target of a "/url?q=<target>&..."
    # redirect link. The target ends at the first "&" or at the end of the href.
    GOOGLE_REDIRECT_PATTERN = %r{\A/url\?q=([^&]*)}.freeze

    attr_reader :raw_html, :doc, :organic_results, :jsmodel

    # @param raw_html [String] HTML source of a Google results page
    # @raise [RuntimeError] "Could not parse HTML" when anything goes wrong;
    #   the underlying error stays available through Exception#cause
    def initialize(raw_html)
      @raw_html = raw_html
      @doc = Nokogiri::HTML(raw_html)
      @organic_results = []
      body = @doc.at_css("body")
      @jsmodel = body && body["jsmodel"]

      parse_organic_results
    rescue StandardError
      raise "Could not parse HTML"
    end

    # Selector set matching this page's jsmodel (memoized).
    #
    # @return [Hash] selectors grouped by result type, e.g. :organic_results
    # @raise [InvalidJsModelError] when no selector set exists for the jsmodel
    def selectors
      @selectors ||= SELECTORS.fetch(@jsmodel) do
        raise InvalidJsModelError, "Could not find selectors for jsmodel: \"#{@jsmodel}\""
      end
    end

    private

    # Fills @organic_results with one hash per result container, in page order.
    def parse_organic_results
      css = selectors.fetch(:organic_results)

      @organic_results = @doc.css(css[:container]).each_with_index.map do |node, index|
        url = parse_google_url(node.css(css[:url]))
        domain = extract_domain(url)
        {
          position: index + 1,
          title: node.css(css[:title]).text.strip,
          description: node.css(css[:description]).text.strip,
          breadcrumbs: node.css(css[:breadcrumbs]).text.strip,
          url: url,
          domain: domain,
          root_domain: strip_www(domain)
        }
      end
    end

    # Host part of +url+, or nil when the URL is missing, relative or cannot
    # be parsed. A single odd link must not abort parsing of the whole page.
    def extract_domain(url)
      return if url.nil?

      URI.parse(url).host
    rescue URI::InvalidURIError
      nil
    end

    # Removes one leading "www." label from +domain+; nil stays nil.
    def strip_www(domain)
      domain&.sub(/\Awww\./, "")
    end

    # Resolves the link target of a result.
    #
    # @param full_google_uri [#attr] node set of the result's link element(s);
    #   the href of the first element is used
    # @return [String, nil] the decoded target of a "/url?q=..." redirect,
    #   the href unchanged for any other link, or nil when there is no link
    def parse_google_url(full_google_uri)
      href = full_google_uri.attr("href")&.value
      return href unless href&.start_with?("/url?q=")

      encoded = href[GOOGLE_REDIRECT_PATTERN, 1]
      begin
        URI.decode_www_form_component(encoded)
      rescue ArgumentError
        # Malformed percent-encoding: fall back to the raw value.
        encoded
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: google_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Rasmus Kjellberg
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-08-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.15.3
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.15.3
|
27
|
+
description: Write a longer description or delete this line.
|
28
|
+
email:
|
29
|
+
- 2277443+kjellberg@users.noreply.github.com
|
30
|
+
executables: []
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- README.md
|
35
|
+
- lib/google_parser.rb
|
36
|
+
- lib/google_parser/version.rb
|
37
|
+
homepage: https://rankzon.com
|
38
|
+
licenses:
|
39
|
+
- MIT
|
40
|
+
metadata:
|
41
|
+
homepage_uri: https://rankzon.com
|
42
|
+
source_code_uri: https://github.com/kjellberg/google_parser
|
43
|
+
changelog_uri: https://github.com/kjellberg/google_parser/blob/master/CHANGELOG.md
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options: []
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 2.6.0
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
requirements: []
|
59
|
+
rubygems_version: 3.4.10
|
60
|
+
signing_key:
|
61
|
+
specification_version: 4
|
62
|
+
summary: Write a short summary, because RubyGems requires one.
|
63
|
+
test_files: []
|