socrates_scraper 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +2 -0
- data/README.md +0 -0
- data/gemfile +4 -0
- data/lib/socrates_scraper.rb +2 -0
- data/lib/socrates_scraper/scraper.rb +88 -0
- data/lib/socrates_scraper/student.rb +9 -0
- data/socrates_scraper.gemspec +25 -0
- metadata +106 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
YmFmZDIxNDVmOGFhYTI5ODY1NDQ5YmRmMWMwNDczZjNjNzU0NTkzYQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
NmFmYTRhYWZmZjI4YmNlZmQyNGE2NDIyOTVkZTE4ZjFlZjE4ZWUwYw==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NjdjMTE0ODVkZjQ0ZDQ2NWJhNzY2Yzk1ZTFkNzYzMGMxY2RhZDMyNGUxYjk2
|
10
|
+
ZDlkYTI4N2JmZWUyMjA0YjBmMTM4ZDIxZjFiMTQ0YmYzMjYxY2UzODBhZWNl
|
11
|
+
ZDczMGMxNGMyNGE4ZjBjYjQxMDg0Y2U4ZjZmNjE1NmIwOWUyYjQ=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
YjliMDU0YTY1NmUwZjQ2Y2RlNjQxNTdlMTNhNjdiNDExZTUxYzFhZTJhOWM5
|
14
|
+
ZjdjM2MzODFjOTEwMWM0Y2ZlNzIyOGIzYTk5MDk3YzBmZTBiOWMzYTEzMDRh
|
15
|
+
M2RmNTdlNjY5YmNhMGJhOTQwOTlkMTBiMjM4ZjY0M2Y5Mjc5ZjY=
|
data/.gitignore
ADDED
data/README.md
ADDED
File without changes
|
data/gemfile
ADDED
data/lib/socrates_scraper/scraper.rb
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
# Logs in to Socrates (socrates.devbootcamp.com) with Mechanize and scrapes
# the profile page of every user linked from a cohort page.
#
# Usage:
#   Scraper.new(email, password).get_user_profiles
#   # => [{ name: ..., image: ..., facebook_id: ... }, ...]
class Scraper
  # Login page; the first form on this page is used to authenticate.
  URL = "https://socrates.devbootcamp.com/login"

  # Template for a cohort page; %s is replaced by the cohort id.
  COHORT_URL = "https://socrates.devbootcamp.com/cohorts/%s"

  # Cohort scraped when the caller does not specify one.
  DEFAULT_COHORT_ID = 78

  # @param email [String] login e-mail
  # @param password [String] login password
  # @param cohort_id [Integer, String] id of the cohort page to scrape
  #   (defaults to DEFAULT_COHORT_ID)
  def initialize(email, password, cohort_id = DEFAULT_COHORT_ID)
    @email = email
    @password = password
    @cohort_id = cohort_id

    @agent = Mechanize.new
  end

  # Logs in, loads the cohort page and scrapes every linked user profile.
  #
  # @return [Array<Hash>] one hash per user with the keys :name, :image
  #   and :facebook_id
  # @raise [RuntimeError] when the cohort page contains no user links,
  #   which is treated as a failed login
  def get_user_profiles
    cohort_page = get_cohort_page
    user_links = get_user_links(cohort_page)

    raise "Invalid username or password" if user_links.empty?

    puts "Getting user profiles..."
    create_user_profiles(user_links)
  end

  private

  # Visits the login page, submits the credentials and returns the page
  # fetched from the cohort URL.
  def get_cohort_page
    @agent.get(URL)
    login
    @agent.get(format(COHORT_URL, @cohort_id))
  end

  # Fills in and submits the first form of the page the agent is currently on.
  #
  # @raise [RuntimeError] when the current page has no form
  def login
    form = @agent.page.forms.first
    raise "Login form not found at #{URL}" if form.nil?

    form.email = @email
    form.password = @password
    form.submit
  end

  # Collects the links on +page+ whose href matches /users/.
  #
  # @return [Array] the links, without the first one and without
  #   duplicate hrefs
  def get_user_links(page)
    users = page.links_with(href: /users/)

    # remove top link 'My Profile', then
    # deduplicate links (photo link + text link)
    users.drop(1).uniq(&:href)
  end

  # Visits every link and converts the resulting profile into a hash.
  def create_user_profiles(user_links)
    user_links.map do |link|
      # reset to cohort page after every user
      @agent.transact do
        user_html = extract_profile_html(link)
        convert_html_to_person_hash(user_html)
      end
    end
  end

  # Follows +link+ and returns a Nokogiri document made of the page's
  # 'div.profile' markup followed by a <p class='soc_id'> element holding
  # the digits that end the link's href.
  def extract_profile_html(link)
    page = @agent.click(link)

    # get the socrates id and convert into an
    # easily identifiable html element
    socrates_id = link.href[%r{.*/(\d*)}, 1]
    socrates_tag = "<p class='soc_id'>#{socrates_id}</p>"

    Nokogiri::HTML("#{page.search('div.profile')}#{socrates_tag}")
  end

  # Builds the person hash from a profile document. A value is nil when the
  # corresponding element is missing from the document.
  def convert_html_to_person_hash(html)
    {
      name: get_name(html),
      image: get_image(html),
      facebook_id: get_facebook_id(html)
    }
  end

  # @return [String, nil] stripped text of the first text node directly
  #   under an h1, or nil when there is none
  def get_name(html)
    node = html.css('h1 > text()').first
    node && node.text.strip
  end

  # @return [String, nil] src attribute of the first '.user > img' element,
  #   or nil when there is none
  def get_image(html)
    node = html.css('.user > img').first
    node && node['src']
  end

  # @return [String, nil] whatever follows the last '/' in the text of the
  #   fifth dd element; nil when that element is missing or holds no '/'
  def get_facebook_id(html)
    # NOTE(review): relies on the fifth <dd> being the Facebook entry —
    # confirm against the live profile markup.
    node = html.css('dd')[4]
    node && node.text[%r{.*/(.*)}, 1]
  end
end
|
data/socrates_scraper.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# Gem specification for socrates_scraper.

# Put this gem's lib/ directory on the load path.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

Gem::Specification.new do |spec|
  spec.name        = 'socrates_scraper'
  spec.version     = '0.0.1'
  spec.date        = '2014-08-22'
  spec.summary     = "Get user info from a Socrates cohort"
  spec.description = "Scrapes user info from a cohort in Socrates"
  spec.authors     = ["James Robinson"]
  spec.email       = 'james.michael.robinson@gmail.com'
  # Points at this gem's own page (was a leftover from facebook_word_counter).
  spec.homepage    = 'http://rubygems.org/gems/socrates_scraper'
  spec.license     = 'MIT'

  # Package every file tracked by git. The earlier hard-coded
  # spec.files assignment was dead code (overwritten here) and is removed.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_runtime_dependency "nokogiri", '~> 1.6'
  spec.add_runtime_dependency 'mechanize', '~> 2.7.3'
end
|
metadata
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: socrates_scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- James Robinson
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-08-22 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ~>
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.6'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.6'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: mechanize
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 2.7.3
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ~>
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 2.7.3
|
69
|
+
description: Scrapes user info from a cohort in Socrates
|
70
|
+
email: james.michael.robinson@gmail.com
|
71
|
+
executables: []
|
72
|
+
extensions: []
|
73
|
+
extra_rdoc_files: []
|
74
|
+
files:
|
75
|
+
- .gitignore
|
76
|
+
- README.md
|
77
|
+
- gemfile
|
78
|
+
- lib/socrates_scraper.rb
|
79
|
+
- lib/socrates_scraper/scraper.rb
|
80
|
+
- lib/socrates_scraper/student.rb
|
81
|
+
- socrates_scraper.gemspec
|
82
|
+
homepage: http://rubygems.org/gems/facebook_word_counter
|
83
|
+
licenses:
|
84
|
+
- MIT
|
85
|
+
metadata: {}
|
86
|
+
post_install_message:
|
87
|
+
rdoc_options: []
|
88
|
+
require_paths:
|
89
|
+
- lib
|
90
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
91
|
+
requirements:
|
92
|
+
- - ! '>='
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '0'
|
95
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - ! '>='
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '0'
|
100
|
+
requirements: []
|
101
|
+
rubyforge_project:
|
102
|
+
rubygems_version: 2.1.5
|
103
|
+
signing_key:
|
104
|
+
specification_version: 4
|
105
|
+
summary: Get user info from a Socrates cohort
|
106
|
+
test_files: []
|