linsc 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +40 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/linsc +5 -0
- data/bin/setup +8 -0
- data/data/agents.txt +10 -0
- data/data/proxies.txt +0 -0
- data/data/recruiters.txt +0 -0
- data/lib/linsc.rb +159 -0
- data/lib/linsc/cross_ref.rb +113 -0
- data/lib/linsc/csv_handlers.rb +53 -0
- data/lib/linsc/duck.rb +179 -0
- data/lib/linsc/lin.rb +303 -0
- data/lib/linsc/merger.rb +70 -0
- data/lib/linsc/parsers.rb +320 -0
- data/lib/linsc/proxy.rb +30 -0
- data/lib/linsc/proxy_handler.rb +42 -0
- data/linsc-0.0.1.gem +0 -0
- data/linsc.gemspec +31 -0
- metadata +140 -0
data/lib/linsc/merger.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
require_relative 'csv_handlers'
|
2
|
+
|
3
|
+
|
4
|
+
# Merges the per-recruiter export files (LIN*.csv) found in a directory into a
# single CSV, keeping exactly one row per e-mail address.
#
# When the same e-mail address appears in several exports, the row kept is the
# one belonging to the recruiter listed earliest in data/recruiters.txt.
#
# File creation, header discovery and row writing are delegated to helpers
# mixed in from CSVHandlers (get_headers, create_file, create_row,
# append_to_csv), which are not defined in this class.
class Merger
  include CSVHandlers

  # @param input_dir [Pathname] directory whose direct children are scanned
  #   for files named like LIN*.csv (must respond to #children)
  # @param output_path [String, Pathname] destination CSV; an existing file at
  #   this path is deleted and recreated
  # @param mapping [Hash, nil] optional source-header => output-header map;
  #   when given, only mapped columns are written and the output headers are
  #   the map's values. When nil, headers are taken from the first input file.
  def initialize(input_dir, output_path, mapping = nil)
    @input_dir, @output_path, @mapping = input_dir, output_path, mapping
    # recruiters.txt is a comma-separated list, in priority order.
    recruiter_file = Pathname.new(File.dirname __dir__).realdirpath + '../data/recruiters.txt'
    @recruiters = recruiter_file.read.split(",").map(&:strip)
    # Match on the file name only, so "LIN" or ".csv" appearing in a parent
    # directory name cannot select unrelated files.
    @lin_files = @input_dir.children.select { |fn| fn.basename.to_s.match(/LIN.+\.csv/) }
    @headers = mapping ? mapping.values : get_headers(@lin_files.first)
    File.delete(@output_path) if File.exist?(@output_path)
    create_file(@output_path)
  end

  # Reads every input file and groups its rows by lower-cased e-mail address.
  # Each row gets a "Recruiter" field holding the LIN... part of its file name.
  #
  # NOTE(review): rows without an 'E-mail Address' all share the nil key, so
  # #merge emits only one of them -- confirm this is intended.
  #
  # @return [Hash{String, nil => Array<CSV::Row>}]
  def construct_emails_hash
    emails = {}
    @lin_files.each do |pn|
      lin_file = pn.to_s
      # Derive the recruiter name from the file name, not the whole path, so a
      # directory containing "LIN" does not leak into the name.
      recruiter_name = pn.basename.to_s.match(/LIN[^.]+/)[0]
      puts "merging #{recruiter_name}"
      clean_file = File.read(lin_file, encoding: 'windows-1252').strip
      CSV.parse(clean_file, headers: true, encoding: 'windows-1252') do |row|
        row["Recruiter"] = recruiter_name
        email = row['E-mail Address']&.downcase
        (emails[email] ||= []) << row
      end
    end
    emails
  end

  # Writes one output row per distinct e-mail address and returns the output
  # path.
  #
  # @return [String, Pathname] the output path given to #initialize
  def merge
    emails = construct_emails_hash
    total = emails.length
    emails.each_with_index do |(_email, rows), index|
      puts "merging - row #{index + 1}/#{total}"
      # Highest-priority recruiter that actually owns a row in this group.
      owners = rows.map { |row| row['Recruiter'] }
      preferred = @recruiters.find { |rec| owners.include?(rec) }
      # Fall back to the first row when no listed recruiter matches (for
      # example when recruiters.txt is empty) instead of failing on nil.
      correct_row = rows.find { |row| row['Recruiter'] == preferred } || rows.first
      if @mapping
        output_row = CSV::Row.new(@headers, [])
        correct_row.each do |key, value|
          output_row[@mapping[key]] = value&.encode('utf-8') if @mapping[key]
        end
        output_row['Email'] = output_row['Email']&.downcase
      else
        output_row = create_row(correct_row, @headers, 'utf-8')
      end
      append_to_csv(@output_path, output_row)
    end
    @output_path
  end
end
|
@@ -0,0 +1,320 @@
|
|
1
|
+
# Helpers that turn a parsed profile page into CSV rows.
#
# The including class must provide @headers, @education_headers,
# @employment_headers and #format_date; none of them are defined here.
# `page` only needs #at_css and #css (presumably a Nokogiri document -- the
# parser library is not visible from this module).
module Parsers
  # Field specs used when rendering the plain-text resume:
  # [css selector, format template, skip the field when its text is empty].
  RESUME_TITLE        = ['.item-title', "\n%s\n", false].freeze
  RESUME_SUBTITLE     = ['.item-subtitle', "%s\n", false].freeze
  RESUME_ORG          = ['.item-subtitle', " - %s\n", true].freeze
  RESUME_DATES        = ['.date-range', "%s\n", false].freeze
  RESUME_DESCRIPTION  = ['.description', "%s\n", false].freeze
  RESUME_CONTRIBUTORS = ['.contributors', "%s\n", false].freeze

  # Sections rendered before the interests/skills/languages lists:
  # [heading, css selector of the entries, field specs].
  RESUME_SECTIONS_HEAD = [
    ["\n\nEXPERIENCE\n", "#experience .positions .position",
     [RESUME_TITLE, RESUME_ORG, RESUME_DATES, ['.location', "%s\n", false], RESUME_DESCRIPTION]],
    ["\n\nEDUCATION\n", "#education .schools .school",
     [RESUME_TITLE, RESUME_ORG, RESUME_DATES, RESUME_DESCRIPTION]],
    ["\n\nCERTIFICATIONS\n", ".certifications .certification",
     [RESUME_TITLE, RESUME_SUBTITLE, RESUME_DATES]]
  ].freeze

  # Sections rendered after the interests/skills/languages lists.
  RESUME_SECTIONS_TAIL = [
    ["\n\nPROJECTS\n", "#projects .project",
     # The trailing space after the newline is kept from the original output.
     [RESUME_TITLE, RESUME_DATES, RESUME_DESCRIPTION, ['.contributors', "%s\n ", false]]],
    ["\n\nPUBLICATIONS\n", "#publications .publication",
     [RESUME_TITLE, RESUME_SUBTITLE, RESUME_DATES, RESUME_DESCRIPTION, RESUME_CONTRIBUTORS]],
    ["\n\nVOLUNTEERING\n", "#volunteering .position",
     [RESUME_TITLE, RESUME_SUBTITLE, RESUME_DATES, ['.cause', "Cause: %s\n", false], RESUME_DESCRIPTION]],
    ["\n\nORGANIZATIONS\n", "#organizations li",
     [RESUME_TITLE, RESUME_SUBTITLE, RESUME_DATES, RESUME_DESCRIPTION]],
    ["\n\nPATENTS\n", "#patents .patent",
     [RESUME_TITLE, RESUME_SUBTITLE, RESUME_DATES, RESUME_DESCRIPTION, RESUME_CONTRIBUTORS]],
    ["\n\nAWARDS\n", "#awards .award",
     [RESUME_TITLE, RESUME_SUBTITLE, RESUME_DATES, RESUME_DESCRIPTION]],
    ["\n\nCOURSES\n", "#courses li",
     [RESUME_TITLE, ['.courses-list', "%s\n", false]]]
  ].freeze

  # Builds the contact row for one profile.
  #
  # Identifying fields are copied from input_row; profile fields (name, title,
  # up to three employers, two certifications, two schools) are read from the
  # page, and a plain-text resume of the whole profile is stored under
  # "Text Resume". Missing page elements yield nil fields instead of raising.
  #
  # Side effect: <br> elements inside the summary and inside entry
  # descriptions are replaced in `page` by newlines while the resume is built.
  #
  # @param input_row [#[]] source row, read by header name
  # @param page [#at_css, #css] parsed profile page
  # @param mode not used by this method; kept for interface compatibility
  # @return [CSV::Row] a row built on @headers
  def scrape_contact(input_row, page, mode)
    row = CSV::Row.new(@headers, [])
    # Default to an empty name so a page without #name does not raise below.
    name = page.at_css("#name")&.text&.split || []
    email = input_row["Email"]
    lin_profile = input_row["Linkedin Profile"]
    title = page.at_css(".headline.title")&.text
    country = page.at_css("#demographics .locality")&.text
    sector = page.at_css("#demographics .descriptor:not(.adr)")&.text
    positions = page.css("#experience .positions .position")
    certs = page.css(".certifications .certification")
    schools = page.css("#education .schools .school")

    row["Contact ID"] = input_row["Contact ID"]
    row["LIN ID"] = input_row["LIN ID"]
    row["CV TR"] = "1"
    row["Account Name"] = input_row["Account Name"]
    row["Linkedin Import Status"] = input_row["Linkedin Import Status"]
    row["First Name"] = name[0]&.slice(0, 39)
    row["Last Name"] = name[1..-1]&.join(" ")&.slice(0, 79)
    row["Email"] = email
    row["Candidate ID"] = input_row["Candidate ID"]
    row["LIN 1st Degree"] = input_row["LIN 1st Degree"]
    row["Title"] = title&.slice(0, 127)
    row["Contact Country"] = country
    row["Contact LIN Sector"] = sector&.slice(0, 99)

    # These fields are read before the resume is built, i.e. before <br>
    # elements in descriptions are turned into newlines.
    (1..3).each do |n|
      position = positions[n - 1]
      row["Employer #{n} Title"] = profile_text(position, ".item-title")&.slice(0, 31999)
      row["Employer Organization Name #{n}"] = profile_text(position, ".item-subtitle")&.slice(0, 254)
      row["Employer #{n} Start Date"] = format_date(profile_time_text(position, 0))
      row["Employer #{n} End Date"] = format_date(profile_time_text(position, 1))
      row["Employer #{n} Location"] = profile_text(position, ".location")&.slice(0, 254)
      row["Employer #{n} Description"] = profile_text(position, ".description")&.slice(0, 31999)
    end

    row["License or Certification Name 1"] = profile_text(certs[0], ".item-title")&.slice(0, 254)
    row["License or Certification Name 2"] = profile_text(certs[1], ".item-title")&.slice(0, 254)
    row["License or Certification Credential Type"] = profile_text(certs[0], ".item-subtitle")&.slice(0, 254)

    (1..2).each do |n|
      school = schools[n - 1]
      row["Education School #{n}"] = profile_text(school, ".item-title")&.slice(0, 124)
      row["Education Degree Name #{n}"] = profile_text(school, ".item-subtitle")&.slice(0, 254)
      row["Education Degree Date #{n}"] = format_date(profile_time_text(school, 1))
    end

    text_resume = build_text_resume(page, lin_profile, name, email, title, country, sector)
    row["Text Resume"] = text_resume.slice(0, 31999)
    row["LinkedIn Profile"] = lin_profile&.slice(0, 254)
    row["Resume Last Updated"] = Time.now.strftime('%Y-%m-%d %H:%M:%S')
    row["LIN Import Date"] = Time.now.strftime('%Y-%m-%d')
    row["CV Uploaded"] = "1"

    row
  end

  # Builds one education row per school listed on the page.
  #
  # "Graduation Year" holds the digits of the end date, or of the start date
  # when there is no end date; it is nil when the school has no dates at all.
  #
  # @param input_row [#[]] source row ("Contact ID" and "LIN ID" are copied)
  # @param page [#css] parsed profile page
  # @return [Array<CSV::Row>] rows built on @education_headers
  def scrape_education(input_row, page)
    page.css("#education .schools .school").map do |school|
      row = CSV::Row.new(@education_headers, [])
      row["Contact"] = input_row["Contact ID"]
      row["LIN ID"] = input_row["LIN ID"]
      row["School Name"] = profile_text(school, ".item-title")&.slice(0, 149)
      row["Major"] = profile_text(school, ".item-subtitle")&.slice(0, 254)
      times = school.css(".date-range time")
      year_source = times[1] || times[0]
      row["Graduation Year"] = year_source&.text&.gsub(/\D/, '')&.slice(0, 74)
      row
    end
  end

  # Builds one employment row per position listed on the page.
  #
  # A position without an end date, start date or location produces nil for
  # that field (format_date then receives nil, as it already does from
  # #scrape_contact).
  #
  # @param input_row [#[]] source row ("Contact ID" and "LIN ID" are copied)
  # @param page [#css] parsed profile page
  # @return [Array<CSV::Row>] rows built on @employment_headers
  def scrape_employment(input_row, page)
    page.css("#experience .positions .position").map do |position|
      row = CSV::Row.new(@employment_headers, [])
      row["Contact"] = input_row["Contact ID"]
      row["LIN ID"] = input_row["LIN ID"]
      row["Job Title"] = profile_text(position, ".item-title")&.slice(0, 74)
      row["Employer Name"] = profile_text(position, ".item-subtitle")&.slice(0, 149)
      row["Start Date"] = format_date(profile_time_text(position, 0))
      row["End Date"] = format_date(profile_time_text(position, 1))
      row["Location"] = profile_text(position, ".location")&.slice(0, 254)
      row
    end
  end

  private

  # Text of the first element matching selector under node; nil when node is
  # nil or nothing matches.
  def profile_text(node, selector)
    node&.at_css(selector)&.text
  end

  # Text of the index-th <time> inside node's date range; nil when absent.
  # Every step is nil-safe: `&.` only guards the single call it is attached to.
  def profile_time_text(node, index)
    node&.css(".date-range time")&.[](index)&.text
  end

  # Renders the whole profile as plain text, section by section.
  def build_text_resume(page, lin_profile, name, email, title, country, sector)
    resume = +"\n\n***IMPORTED FROM LINKEDIN***\n#{lin_profile}\n\n"
    resume << name.join(" ")
    resume << "\n#{email}"
    resume << "\nTitle: #{title}" if title
    resume << "\nLocation: #{country}" if country
    resume << "\nSector: #{sector}" if sector

    summary = page.at_css("#summary .description")
    if summary
      summary.css('br').each { |br| br.replace "\n" }
      resume << "\n\nSUMMARY\n#{summary.text}"
    end

    RESUME_SECTIONS_HEAD.each do |heading, selector, fields|
      append_resume_entries(resume, heading, page.css(selector), fields)
    end
    append_resume_pills(resume, "\nINTERESTS\n", page.css("#interests .pills .interest"))
    append_resume_pills(resume, "\n\nSKILLS\n", page.css("#skills .pills .skill"))
    append_resume_languages(resume, page.css("#languages .language"))
    RESUME_SECTIONS_TAIL.each do |heading, selector, fields|
      append_resume_entries(resume, heading, page.css(selector), fields)
    end
    resume
  end

  # Appends a heading (only when there are entries) followed by the listed
  # fields of every entry; fields missing from an entry are skipped.
  def append_resume_entries(resume, heading, nodes, fields)
    resume << heading unless nodes.empty?
    nodes.each do |node|
      fields.each do |selector, template, skip_empty|
        field = node.at_css(selector)
        next unless field
        # Keep line breaks of descriptions in the plain-text output.
        field.css('br').each { |br| br.replace "\n" } if selector == '.description'
        text = field.text
        next if skip_empty && text.empty?
        resume << Kernel.format(template, text)
      end
    end
  end

  # Appends a comma-separated list of pill labels, leaving out the
  # "See less" / "See N+" toggle entries. The list line is always written,
  # even when it is empty.
  def append_resume_pills(resume, heading, pills)
    resume << heading unless pills.empty?
    labels = pills.map { |pill| pill.at_css(".wrap")&.text }.compact
    labels.reject! { |label| (label == "See less") || label.match(/See \d+\+/) }
    resume << "#{labels.join(", ")}\n\n"
  end

  # Appends the languages as "Name (Proficiency)"; entries without a name are
  # skipped. The list line is always written, even when it is empty.
  def append_resume_languages(resume, languages)
    resume << "\n\nLANGUAGES\n" unless languages.empty?
    labels = languages.map do |language|
      label = language.at_css(".name")&.text
      next unless label
      proficiency = language.at_css(".proficiency")
      proficiency && !proficiency.text.empty? ? "#{label} (#{proficiency.text})" : label
    end.compact
    resume << "#{labels.join(", ")}\n\n"
  end
end
|
data/lib/linsc/proxy.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# One proxy endpoint together with its credentials, health status, the time it
# was last used and the user agent associated with it.
class Proxy
  attr_accessor :ip, :port, :username, :password, :status, :last_used, :user_agent

  # @param ip [String] proxy address
  # @param port [Integer, String] proxy port (defaults to 80)
  # @param username [String, nil] optional credential
  # @param password [String, nil] optional credential
  # @param status [String, nil] 'good' or 'dead' (see #good? / #dead?)
  # @param last_used [Time, nil] time of the last use
  # @param user_agent [String, nil] user agent string tied to this proxy
  def initialize(ip:, port: 80, username: nil, password: nil, status: nil, last_used: nil, user_agent: nil)
    @ip = ip
    @port = port
    @username = username
    @password = password
    @status = status
    @last_used = last_used
    # Previously accepted but never stored, so #user_agent was always nil.
    @user_agent = user_agent
  end

  # Marks the proxy as dead and records the current time as its last use.
  def dead
    @status = 'dead'
    @last_used = Time.now
  end

  # Marks the proxy as good and records the current time as its last use.
  def good
    @status = 'good'
    @last_used = Time.now
  end

  # @return [Boolean] whether the status is 'good'
  def good?
    @status == 'good'
  end

  # @return [Boolean] whether the status is 'dead'
  def dead?
    @status == 'dead'
  end

  # Records the current time as the last use without touching the status.
  def used
    @last_used = Time.now
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require_relative 'proxy'
|
2
|
+
|
3
|
+
# Loads the proxy list and the user-agent list from the data directory and
# hands out the least recently used good proxy, pausing when needed so that a
# proxy is not reused before its cooldown has elapsed.
class ProxyHandler
  # Data directory shipped with the gem, resolved relative to this file so the
  # lookup does not depend on the current working directory.
  DEFAULT_DATA_DIR = File.expand_path('../../data', __dir__)

  # Terminates the process (Kernel#exit) after printing a message when
  # proxies.txt contains no proxies.
  #
  # @param cooldown_time [Numeric] minimum number of seconds between two uses
  #   of the same proxy
  # @param data_dir [String] directory containing proxies.txt (one
  #   "ip:port:username:password" entry per line; everything after the ip is
  #   optional) and agents.txt (one user agent per line)
  def initialize(cooldown_time = 5, data_dir: DEFAULT_DATA_DIR)
    @cooldown_time = cooldown_time
    @ua_list = read_entries(File.join(data_dir, 'agents.txt'))
    @proxy_list = read_entries(File.join(data_dir, 'proxies.txt')).map { |line| line.split(':') }

    @proxies = @proxy_list.each_with_index.map do |(ip, port, username, password), index|
      # A missing port falls back to 80 rather than overriding it with nil.
      # Every proxy starts out "cooled down" so it can be used immediately.
      Proxy.new(ip: ip, port: port || 80, username: username, password: password,
                status: 'good', last_used: Time.now - @cooldown_time,
                user_agent: agent_for(index))
    end

    if @proxies.empty?
      puts "proxies.txt is empty! if you don't want to use any proxies, use the -n flag. see docs for more."
      exit
    end
  end

  # Returns the good proxy that has been idle the longest, sleeping first for
  # whatever remains of its cooldown. Terminates the process (Kernel#exit)
  # after printing a message when no good proxy is left.
  #
  # This method does not update the proxy's last-used time itself.
  #
  # @return [Proxy]
  def get_proxy
    best_proxy = @proxies.select(&:good?).min_by(&:last_used)
    unless best_proxy
      puts "All proxies are dead. Wait a few hours before resuming."
      exit
    end
    duration = Time.now - best_proxy.last_used
    sleep(@cooldown_time - duration) if duration < @cooldown_time
    best_proxy
  end

  # @return [Integer] number of proxies loaded, good or dead
  def length
    @proxies.length
  end

  private

  # Non-empty lines of the file, stripped of surrounding whitespace (including
  # the "\r" of CRLF line endings).
  def read_entries(path)
    File.readlines(path).map(&:strip).reject(&:empty?)
  end

  # User agent for the index-th proxy; agents are reused in order when there
  # are more proxies than agents. nil when no agent is configured.
  def agent_for(index)
    return nil if @ua_list.empty?

    @ua_list[index % @ua_list.length]
  end
end
|