rubyscraper 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +57 -0
- data/README.md +39 -0
- data/Rakefile +1 -0
- data/bin/console +4 -0
- data/bin/rubyscraper +5 -0
- data/bin/setup +7 -0
- data/lib/assets/search-terms.txt +5 -0
- data/lib/rubyscraper.rb +90 -0
- data/lib/rubyscraper/binary.rb +17 -0
- data/lib/rubyscraper/version.rb +3 -0
- data/rubyscraper.gemspec +24 -0
- data/spec/rubyscraper_spec.rb +11 -0
- data/spec/spec_helper.rb +2 -0
- metadata +156 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 6f4cfba8b1442632b6c54f30886d254ff25fbbd3
|
4
|
+
data.tar.gz: 4bc8431e8294d5900819d29735f69650e93448da
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 73ff93065f3079602dfcc58e35e08686ce4fff2d693b7f80c63e8cd37f011826156731b7d8a42a84472232c4a0c5b18dcb8efde1e7a7ede2ded37c431d22f95f
|
7
|
+
data.tar.gz: 04e30fe957b8d95a25cb0620539eca3b8956eb1c5638c8bd038aa6b461f4868555889b033201b350dc651f5faff7e2e38c2fac86ea98ce4bf13ac76de21834c5
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
rubyscraper (0.1.0)
|
5
|
+
capybara
|
6
|
+
poltergeist
|
7
|
+
rest-client
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
capybara (2.4.4)
|
13
|
+
mime-types (>= 1.16)
|
14
|
+
nokogiri (>= 1.3.3)
|
15
|
+
rack (>= 1.0.0)
|
16
|
+
rack-test (>= 0.5.4)
|
17
|
+
xpath (~> 2.0)
|
18
|
+
cliver (0.3.2)
|
19
|
+
domain_name (0.5.24)
|
20
|
+
unf (>= 0.0.5, < 1.0.0)
|
21
|
+
http-cookie (1.0.2)
|
22
|
+
domain_name (~> 0.5)
|
23
|
+
mime-types (2.4.3)
|
24
|
+
mini_portile (0.6.2)
|
25
|
+
multi_json (1.11.0)
|
26
|
+
netrc (0.10.3)
|
27
|
+
nokogiri (1.6.6.2)
|
28
|
+
mini_portile (~> 0.6.0)
|
29
|
+
poltergeist (1.6.0)
|
30
|
+
capybara (~> 2.1)
|
31
|
+
cliver (~> 0.3.1)
|
32
|
+
multi_json (~> 1.0)
|
33
|
+
websocket-driver (>= 0.2.0)
|
34
|
+
rack (1.6.0)
|
35
|
+
rack-test (0.6.3)
|
36
|
+
rack (>= 1.0)
|
37
|
+
rake (10.4.2)
|
38
|
+
rest-client (1.8.0)
|
39
|
+
http-cookie (>= 1.0.2, < 2.0)
|
40
|
+
mime-types (>= 1.16, < 3.0)
|
41
|
+
netrc (~> 0.7)
|
42
|
+
unf (0.1.4)
|
43
|
+
unf_ext
|
44
|
+
unf_ext (0.0.7.1)
|
45
|
+
websocket-driver (0.5.4)
|
46
|
+
websocket-extensions (>= 0.1.0)
|
47
|
+
websocket-extensions (0.1.2)
|
48
|
+
xpath (2.0.0)
|
49
|
+
nokogiri (~> 1.3)
|
50
|
+
|
51
|
+
PLATFORMS
|
52
|
+
ruby
|
53
|
+
|
54
|
+
DEPENDENCIES
|
55
|
+
bundler (~> 1.9)
|
56
|
+
rake (~> 10.0)
|
57
|
+
rubyscraper!
|
data/README.md
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# RubyScraper
|
2
|
+
|
3
|
+
Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/rubyscraper.rb`. To experiment with that code, run `bin/console` for an interactive prompt.
|
4
|
+
|
5
|
+
TODO: Delete this and the text above, and describe your gem
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'rubyscraper'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install rubyscraper
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
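Run the `rubyscraper` executable, passing the URL of the endpoint that should receive the scraped jobs as the first argument:

    $ rubyscraper http://example.com/jobs

Each scraped job is sent to that endpoint as a POST request.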
|
26
|
+
|
27
|
+
## Development
|
28
|
+
|
29
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
30
|
+
|
31
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
32
|
+
|
33
|
+
## Contributing
|
34
|
+
|
35
|
+
1. Fork it ( https://github.com/ndwhtlssthr/rubyscraper/fork )
|
36
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
37
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
38
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
39
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/console
ADDED
data/bin/rubyscraper
ADDED
data/bin/setup
ADDED
data/lib/rubyscraper.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
require 'capybara'
|
2
|
+
require 'capybara/poltergeist'
|
3
|
+
require 'rest-client'
|
4
|
+
require 'rubyscraper/version'
|
5
|
+
|
6
|
+
# Scrapes job postings from Stack Overflow Careers with a headless
# Poltergeist browser and posts each posting to a remote endpoint.
class RubyScraper
  include Capybara::DSL

  # Host that every listing page and job detail page is fetched from.
  BASE_URL = 'http://careers.stackoverflow.com'.freeze

  # Result pages fetched for each search term.
  RESULT_PAGES = (1..2)

  # Registers Poltergeist as the default Capybara driver and loads the search
  # terms bundled with the gem.
  #
  # @param endpoint [String] URL that scraped jobs are POSTed to
  def initialize(endpoint)
    Capybara.register_driver :poltergeist do |app|
      Capybara::Poltergeist::Driver.new(app, js_errors: false)
    end
    Capybara.default_driver = :poltergeist

    @jobs = []
    @posted_jobs = 0
    @endpoint = endpoint
    @search_terms_file = File.expand_path('../assets/search-terms.txt', __FILE__)
    # One term per line; blank lines are skipped so they do not turn into
    # searches with an empty term.
    @search_terms = File.foreach(@search_terms_file).map(&:strip).reject(&:empty?)
  end

  # Runs the full pipeline: collect listing summaries, fetch each posting's
  # details, then post everything to the endpoint.
  #
  # @return [Array(Integer, Integer)] number of jobs scraped and number of
  #   jobs the endpoint reported as newly created
  def scrape
    get_summaries
    get_bodies
    send_to_server
    [@jobs.length, @posted_jobs]
  end

  # Collects position, link and posting date for every listing on the result
  # pages of each search term and appends them to @jobs.
  #
  # The term is interpolated into the query string verbatim, so entries in
  # the search-terms file must already be URL-safe.
  def get_summaries
    @search_terms.each do |term|
      RESULT_PAGES.each do |page|
        # Query the current term on every page. A hard-coded "ruby" here
        # would make every term return the same listings.
        visit "#{BASE_URL}/jobs?searchTerm=#{term}&sort=p&pg=#{page}"
        all(".listResults .-item").each do |listing|
          title_link = listing.find("h3.-title a")
          @jobs << {
            position: title_link.text,
            url: title_link["href"],
            posting_date: listing.first("p._muted").text
          }
        end
      end
      puts "Pulled #{term} job summaries."
    end
  end

  # Visits each collected job's detail page and adds company, location,
  # description and tags to its hash. Company and location are only set when
  # the corresponding element is present on the page.
  def get_bodies
    @jobs.each_with_index do |job, i|
      puts "Job #{i+1} pulled."
      sleep 1 # pause between detail-page requests
      # NOTE(review): assumes job[:url] is a site-relative path — confirm
      # against what the driver returns for href.
      visit "#{BASE_URL}#{job[:url]}"
      job[:company] = find("a.employer").text if has_css?("a.employer")
      job[:location] = find("span.location").text if has_css?("span.location")
      job[:description] = all("div.description p").map(&:text).join("\n")
      job[:tags] = all("div.tags a.post-tag").map(&:text)
    end
  end

  # POSTs every collected job to the endpoint and counts the ones answered
  # with 201 in @posted_jobs. A 302 is reported as an already-existing job;
  # any other status is reported as a bad request.
  def send_to_server
    @jobs.each do |job|
      new_job = {
        position: job[:position],
        location: job[:location],
        description: job[:description],
        source: "#{BASE_URL}#{job[:url]}"
      }
      # The block form hands the response to us for inspection of its code.
      RestClient.post(@endpoint, job: new_job) do |response, _request, _result|
        case response.code
        when 201
          @posted_jobs += 1
          puts "Job saved."
        when 302
          puts "Job already exists."
        else
          puts "Bad request."
        end
      end
    end
  end
end
|
data/lib/rubyscraper/binary.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'rubyscraper'
|
2
|
+
|
3
|
+
class RubyScraper
  # Command-line front end: runs a scrape and reports progress on the given
  # output stream.
  class Binary
    # Runs the scraper against the endpoint named by the first argument.
    #
    # @param argv [Array<String>] command-line arguments; the first one is the
    #   endpoint URL that jobs are posted to
    # @param outstream [#puts] receives progress and summary lines
    # @param errstream [#puts] receives the usage message when no endpoint
    #   is given
    # @return [nil]
    def self.call(argv, outstream, errstream)
      endpoint = argv.first
      # Fail fast: without an endpoint the whole scrape would run and only
      # break when the first job is posted.
      if endpoint.nil? || endpoint.strip.empty?
        errstream.puts "Usage: rubyscraper ENDPOINT_URL"
        return
      end

      outstream.puts "StackOverflow Job Scraper"
      outstream.puts "---------------------------------------------"
      outstream.puts "Started scraping..."
      outstream.puts "Sending post requests to #{endpoint}"
      jobs_scraped, jobs_saved = RubyScraper.new(endpoint).scrape
      outstream.puts "Scraped #{jobs_scraped} jobs, successfully posted #{jobs_saved} jobs."
      outstream.puts "---------------------------------------------"
      outstream.puts "Completed!"
    end
  end
end
|
data/rubyscraper.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
$LOAD_PATH.unshift File.expand_path('../lib', __FILE__)
|
2
|
+
require 'rubyscraper/version'
|
3
|
+
|
4
|
+
# Packaging manifest for the rubyscraper gem.
Gem::Specification.new do |spec|
  # Identity
  spec.name        = 'rubyscraper'
  spec.version     = RubyScraper::VERSION
  spec.licenses    = ['MIT']
  spec.summary     = "Scrapes the sites..."
  spec.description = "Scrapes job sites for job details and sends post request to server."
  spec.authors     = ["Nathan Owsiany"]
  spec.email       = 'nowsiany@gmail.com'

  # Package every regular file below the current directory except built
  # .gem archives sitting at the top level.
  regular_files = Dir["**/*"].select { |path| File.file?(path) }
  spec.files    = regular_files - Dir['*.gem']
  spec.homepage = 'https://github.com/ndwhtlssthr/rubyscraper'
  spec.executables << 'rubyscraper'

  # Runtime dependencies (no version constraints).
  %w[capybara poltergeist rest-client].each do |gem_name|
    spec.add_dependency gem_name
  end

  # Development-only dependencies.
  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency 'rspec', '~> 3.0'
  spec.add_development_dependency 'pry'
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rubyscraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nathan Owsiany
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-04-23 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: capybara
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: poltergeist
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rest-client
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: bundler
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.9'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.9'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rake
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '10.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '10.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rspec
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '3.0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '3.0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: pry
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
description: Scrapes job sites for job details and sends post request to server.
|
112
|
+
email: nowsiany@gmail.com
|
113
|
+
executables:
|
114
|
+
- rubyscraper
|
115
|
+
extensions: []
|
116
|
+
extra_rdoc_files: []
|
117
|
+
files:
|
118
|
+
- Gemfile
|
119
|
+
- Gemfile.lock
|
120
|
+
- README.md
|
121
|
+
- Rakefile
|
122
|
+
- bin/console
|
123
|
+
- bin/rubyscraper
|
124
|
+
- bin/setup
|
125
|
+
- lib/assets/search-terms.txt
|
126
|
+
- lib/rubyscraper.rb
|
127
|
+
- lib/rubyscraper/binary.rb
|
128
|
+
- lib/rubyscraper/version.rb
|
129
|
+
- rubyscraper.gemspec
|
130
|
+
- spec/rubyscraper_spec.rb
|
131
|
+
- spec/spec_helper.rb
|
132
|
+
homepage: https://github.com/ndwhtlssthr/rubyscraper
|
133
|
+
licenses:
|
134
|
+
- MIT
|
135
|
+
metadata: {}
|
136
|
+
post_install_message:
|
137
|
+
rdoc_options: []
|
138
|
+
require_paths:
|
139
|
+
- lib
|
140
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
141
|
+
requirements:
|
142
|
+
- - ">="
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: '0'
|
145
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
146
|
+
requirements:
|
147
|
+
- - ">="
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
150
|
+
requirements: []
|
151
|
+
rubyforge_project:
|
152
|
+
rubygems_version: 2.4.6
|
153
|
+
signing_key:
|
154
|
+
specification_version: 4
|
155
|
+
summary: Scrapes the sites...
|
156
|
+
test_files: []
|