linkedin-scraper 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. usr/lib/python3.6/site-packages/linkedin_scraper/__init__.py +10 -0
  2. usr/lib/python3.6/site-packages/linkedin_scraper/__pycache__/__init__.cpython-36.pyc +0 -0
  3. usr/lib/python3.6/site-packages/linkedin_scraper/__pycache__/company.cpython-36.pyc +0 -0
  4. usr/lib/python3.6/site-packages/linkedin_scraper/__pycache__/functions.cpython-36.pyc +0 -0
  5. usr/lib/python3.6/site-packages/linkedin_scraper/__pycache__/objects.cpython-36.pyc +0 -0
  6. usr/lib/python3.6/site-packages/linkedin_scraper/__pycache__/person.cpython-36.pyc +0 -0
  7. usr/lib/python3.6/site-packages/linkedin_scraper/__pycache__/scraper.cpython-36.pyc +0 -0
  8. usr/lib/python3.6/site-packages/linkedin_scraper/company.py +213 -0
  9. usr/lib/python3.6/site-packages/linkedin_scraper/functions.py +15 -0
  10. usr/lib/python3.6/site-packages/linkedin_scraper/objects.py +60 -0
  11. usr/lib/python3.6/site-packages/linkedin_scraper/person.py +145 -0
  12. usr/lib/python3.6/site-packages/linkedin_scraper/scraper.py +8 -0
  13. usr/lib/python3.6/site-packages/linkedin_scraper-2.1.0-py3.6.egg-info/PKG-INFO +13 -0
  14. usr/lib/python3.6/site-packages/linkedin_scraper-2.1.0-py3.6.egg-info/SOURCES.txt +14 -0
  15. usr/lib/python3.6/site-packages/linkedin_scraper-2.1.0-py3.6.egg-info/dependency_links.txt +1 -0
  16. usr/lib/python3.6/site-packages/linkedin_scraper-2.1.0-py3.6.egg-info/requires.txt +3 -0
  17. usr/lib/python3.6/site-packages/linkedin_scraper-2.1.0-py3.6.egg-info/top_level.txt +1 -0
  18. usr/lib/python3.6/site-packages/linkedin_user_scraper/__init__.py +8 -0
  19. usr/lib/python3.6/site-packages/linkedin_user_scraper/__pycache__/__init__.cpython-36.pyc +0 -0
  20. usr/lib/python3.6/site-packages/linkedin_user_scraper/__pycache__/company.cpython-36.pyc +0 -0
  21. usr/lib/python3.6/site-packages/linkedin_user_scraper/__pycache__/functions.cpython-36.pyc +0 -0
  22. usr/lib/python3.6/site-packages/linkedin_user_scraper/__pycache__/objects.cpython-36.pyc +0 -0
  23. usr/lib/python3.6/site-packages/linkedin_user_scraper/__pycache__/person.cpython-36.pyc +0 -0
  24. usr/lib/python3.6/site-packages/linkedin_user_scraper/__pycache__/scraper.cpython-36.pyc +0 -0
  25. usr/lib/python3.6/site-packages/linkedin_user_scraper/company.py +159 -0
  26. usr/lib/python3.6/site-packages/linkedin_user_scraper/functions.py +15 -0
  27. usr/lib/python3.6/site-packages/linkedin_user_scraper/objects.py +51 -0
  28. usr/lib/python3.6/site-packages/linkedin_user_scraper/person.py +87 -0
  29. usr/lib/python3.6/site-packages/linkedin_user_scraper/scraper.py +8 -0
@@ -0,0 +1,10 @@
1
from os.path import dirname, basename, isfile
from .person import Person
from .objects import Institution, Experience, Education
from .company import Company

__version__ = "2.1.0"

import glob

# Advertise every sibling module (sans the ".py" suffix) in __all__,
# skipping the package initialiser itself.
modules = glob.glob(dirname(__file__) + "/*.py")
__all__ = [
    basename(module_path)[:-3]
    for module_path in modules
    if isfile(module_path) and not module_path.endswith('__init__.py')
]
@@ -0,0 +1,213 @@
1
+ import requests
2
+ from lxml import html
3
+ from selenium import webdriver
4
+ from selenium.webdriver.common.by import By
5
+ from selenium.webdriver.support.ui import WebDriverWait
6
+ from selenium.webdriver.support import expected_conditions as EC
7
+ from .objects import Scraper
8
+ import os
9
+
10
class CompanySummary(object):
    """Lightweight (url, name, follower-count) record for a related company."""

    linkedin_url = None
    name = None
    followers = None

    def __init__(self, linkedin_url = None, name = None, followers = None):
        self.linkedin_url = linkedin_url
        self.name = name
        self.followers = followers

    def __repr__(self):
        # Identity check against the None singleton (was ``== None``).
        if self.followers is None:
            return """ {name} """.format(name = self.name)
        else:
            return """ {name} {followers} """.format(name = self.name, followers = self.followers)
25
+
26
class Company(Scraper):
    """Scrapes a LinkedIn company page into plain attributes.

    Picks a logged-in or logged-out scraping strategy based on whether the
    Selenium session appears to be signed in (see ``Scraper.is_signed_in``).
    """

    linkedin_url = None
    name = None
    about_us = None
    website = None
    headquarters = None
    founded = None
    company_type = None
    company_size = None
    specialties = None
    industry = None  # set by the scrape methods; declared so __repr__ cannot AttributeError
    showcase_pages = []
    affiliated_companies = []

    def __init__(self, linkedin_url=None, name=None, about_us=None, website=None,
                 headquarters=None, founded=None, company_type=None,
                 company_size=None, specialties=None, showcase_pages=None,
                 affiliated_companies=None, driver=None, scrape=True):
        """Open *linkedin_url* in a Chrome session and optionally scrape it.

        ``showcase_pages``/``affiliated_companies`` previously used mutable
        default arguments, so every Company instance created with the
        defaults shared (and kept appending to) the same two lists; they now
        default to fresh lists.
        """
        self.linkedin_url = linkedin_url
        self.name = name
        self.about_us = about_us
        self.website = website
        self.headquarters = headquarters
        self.founded = founded
        self.company_type = company_type
        self.company_size = company_size
        self.specialties = specialties
        self.showcase_pages = [] if showcase_pages is None else showcase_pages
        self.affiliated_companies = [] if affiliated_companies is None else affiliated_companies

        if driver is None:
            try:
                # CHROMEDRIVER env var wins; otherwise try the bundled binary.
                driver_path = os.getenv("CHROMEDRIVER")
                if driver_path is None:
                    driver_path = os.path.join(os.path.dirname(__file__), 'drivers/chromedriver')
                driver = webdriver.Chrome(driver_path)
            except Exception:
                # Fall back to whatever chromedriver is on PATH.
                driver = webdriver.Chrome()

        driver.get(linkedin_url)
        self.driver = driver

        if scrape:
            self.scrape()

    def __get_text_under_subtitle(self, elem):
        # Drop the first line (the subtitle itself); keep the rest verbatim.
        return "\n".join(elem.text.split("\n")[1:])

    def __get_text_under_subtitle_by_class(self, driver, class_name):
        return self.__get_text_under_subtitle(driver.find_element_by_class_name(class_name))

    def scrape(self, close_on_complete=True):
        """Scrape the already-loaded page, closing the driver when done."""
        if self.is_signed_in():
            self.scrape_logged_in(close_on_complete=close_on_complete)
        else:
            self.scrape_not_logged_in(close_on_complete=close_on_complete)

    def scrape_logged_in(self, close_on_complete=True):
        driver = self.driver

        self.name = driver.find_element_by_xpath('//h1[@dir="ltr"]').text
        self.about_us = driver.find_element_by_class_name("org-about-us-organization-description__text").text

        self.specialties = "\n".join(driver.find_element_by_class_name("org-about-company-module__specialities").text.split(", "))
        self.website = driver.find_element_by_class_name("org-about-us-company-module__website").text
        self.headquarters = driver.find_element_by_class_name("org-about-company-module__headquarters").text
        self.industry = driver.find_element_by_class_name("company-industries").text
        self.company_size = driver.find_element_by_class_name("org-about-company-module__company-staff-count-range").text

        # Scroll half-way down so the lazily-loaded related-companies module renders.
        driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight/2));")

        try:
            WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, 'company-list')))
            showcase, affiliated = driver.find_elements_by_class_name("company-list")
            driver.find_element_by_id("org-related-companies-module__show-more-btn").click()

            # Showcase pages.
            for showcase_company in showcase.find_elements_by_class_name("org-company-card"):
                companySummary = CompanySummary(
                    linkedin_url=showcase_company.find_element_by_class_name("company-name-link").get_attribute("href"),
                    name=showcase_company.find_element_by_class_name("company-name-link").text,
                    followers=showcase_company.find_element_by_class_name("company-followers-count").text,
                )
                self.showcase_pages.append(companySummary)

            # Affiliated companies.  BUG FIX: the original iterated the
            # *showcase* element again here, duplicating showcase entries
            # into affiliated_companies; iterate the affiliated list instead.
            for affiliated_company in affiliated.find_elements_by_class_name("org-company-card"):
                companySummary = CompanySummary(
                    linkedin_url=affiliated_company.find_element_by_class_name("company-name-link").get_attribute("href"),
                    name=affiliated_company.find_element_by_class_name("company-name-link").text,
                    followers=affiliated_company.find_element_by_class_name("company-followers-count").text,
                )
                self.affiliated_companies.append(companySummary)

        except Exception:
            # The related-companies module is optional; a missing element or
            # wait timeout simply leaves the lists empty.
            pass

        if close_on_complete:
            driver.close()

    def scrape_not_logged_in(self, close_on_complete=True, retry_limit=10):
        driver = self.driver
        retry_times = 0
        # Reload until the page serves the logged-out layout (or give up).
        while self.is_signed_in() and retry_times <= retry_limit:
            driver.get(self.linkedin_url)
            retry_times = retry_times + 1

        self.name = driver.find_element_by_class_name("name").text

        self.about_us = driver.find_element_by_class_name("basic-info-description").text
        self.specialties = self.__get_text_under_subtitle_by_class(driver, "specialties")
        self.website = self.__get_text_under_subtitle_by_class(driver, "website")
        self.headquarters = driver.find_element_by_class_name("adr").text
        self.industry = driver.find_element_by_class_name("industry").text
        self.company_size = driver.find_element_by_class_name("company-size").text
        self.company_type = self.__get_text_under_subtitle_by_class(driver, "type")
        self.founded = self.__get_text_under_subtitle_by_class(driver, "founded")

        # Showcase pages live behind a dialog: open it, read it, close it.
        try:
            driver.find_element_by_id("view-other-showcase-pages-dialog").click()
            WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, 'dialog')))

            showcase_pages = driver.find_elements_by_class_name("company-showcase-pages")[1]
            for showcase_company in showcase_pages.find_elements_by_tag_name("li"):
                name_elem = showcase_company.find_element_by_class_name("name")
                companySummary = CompanySummary(
                    linkedin_url=name_elem.find_element_by_tag_name("a").get_attribute("href"),
                    name=name_elem.text,
                    followers=showcase_company.text.split("\n")[1],
                )
                self.showcase_pages.append(companySummary)
            driver.find_element_by_class_name("dialog-close").click()
        except Exception:
            pass

        # Affiliated-companies carousel: advance it every third entry.
        try:
            affiliated_pages = driver.find_element_by_class_name("affiliated-companies")
            for i, affiliated_page in enumerate(affiliated_pages.find_elements_by_class_name("affiliated-company-name")):
                if i % 3 == 0:
                    affiliated_pages.find_element_by_class_name("carousel-control-next").click()

                companySummary = CompanySummary(
                    linkedin_url=affiliated_page.find_element_by_tag_name("a").get_attribute("href"),
                    name=affiliated_page.text,
                )
                self.affiliated_companies.append(companySummary)
        except Exception:
            pass

        if close_on_complete:
            driver.close()

    def __repr__(self):
        return """
        {name}

        {about_us}

        Specialties: {specialties}

        Website: {website}
        Industry: {industry}
        Type: {company_type}
        Headquarters: {headquarters}
        Company Size: {company_size}
        Founded: {founded}

        Showcase Pages
        {showcase_pages}

        Affiliated Companies
        {affiliated_companies}
        """.format(
            name=self.name,
            about_us=self.about_us,
            specialties=self.specialties,
            website=self.website,
            industry=self.industry,
            company_type=self.company_type,
            headquarters=self.headquarters,
            company_size=self.company_size,
            founded=self.founded,
            showcase_pages=self.showcase_pages,
            affiliated_companies=self.affiliated_companies,
        )
@@ -0,0 +1,15 @@
1
+ import re
2
+
3
def time_divide(string):
    """Split a LinkedIn date-range string into ``(from, to, duration)``.

    e.g. ``"Jan 2010 – Dec 2012 (3 yrs)"`` -> ``("Jan 2010", "Dec 2012", "3 yrs")``.
    The parenthesised duration is optional ("" is returned when absent).
    The range separator is an en-dash, as rendered by LinkedIn.
    """
    # Raw string: "\(" in a plain literal is an invalid escape sequence.
    duration = re.search(r"\((.*?)\)", string)

    if duration is not None:
        duration = duration.group(0)
        string = string.replace(duration, "").strip()
    else:
        duration = "()"

    times = string.split("–")
    if len(times) < 2:
        # No en-dash separator: the original raised IndexError here; treat
        # the whole string as the start date instead.
        return (times[0].strip(), "", duration[1:-1])
    return (times[0].strip(), times[1].strip(), duration[1:-1])
14
+
15
+
@@ -0,0 +1,60 @@
1
+
2
class Institution(object):
    """Base record for a company or school scraped from LinkedIn.

    Subclass ``__repr__`` implementations format ``institution_name``; the
    original constructor only assigned ``self.name``, leaving
    ``institution_name`` at the class default ``None``.  The constructor now
    populates both (backward compatible: ``name`` is still set).
    """

    institution_name = None
    website = None
    industry = None
    type = None
    headquarters = None
    company_size = None
    founded = None

    def __init__(self, name=None, website=None, industry=None, type=None,
                 headquarters=None, company_size=None, founded=None):
        self.name = name
        self.institution_name = name  # keep subclass __repr__ consistent
        self.website = website
        self.industry = industry
        self.type = type
        self.headquarters = headquarters
        self.company_size = company_size
        self.founded = founded
19
+
20
class Experience(Institution):
    """A single work-experience entry on a profile."""

    from_date = None
    to_date = None
    description = None
    position_title = None

    def __init__(self, from_date=None, to_date=None, description=None, position_title=None):
        # Deliberately does not call Institution.__init__: an experience
        # entry only carries its own fields; callers assign
        # institution_name directly after construction.
        self.position_title = position_title
        self.description = description
        self.to_date = to_date
        self.from_date = from_date

    def __repr__(self):
        fields = {
            "position_title": self.position_title,
            "company": self.institution_name,
            "from_date": self.from_date,
            "to_date": self.to_date,
        }
        return "{position_title} at {company} from {from_date} to {to_date}".format(**fields)
34
+
35
+
36
class Education(Institution):
    """A single education entry on a profile."""

    from_date = None
    to_date = None
    description = None
    degree = None

    def __init__(self, from_date=None, to_date=None, description=None, degree=None):
        # Deliberately does not call Institution.__init__: an education
        # entry only carries its own fields; callers assign
        # institution_name directly after construction.
        self.degree = degree
        self.description = description
        self.to_date = to_date
        self.from_date = from_date

    def __repr__(self):
        fields = {
            "degree": self.degree,
            "company": self.institution_name,
            "from_date": self.from_date,
            "to_date": self.to_date,
        }
        return "{degree} at {company} from {from_date} to {to_date}".format(**fields)
50
+
51
class Scraper(object):
    """Mixin holding the shared Selenium driver handle."""

    driver = None

    def is_signed_in(self):
        """Return True when the page shows the logged-in profile nav item."""
        try:
            self.driver.find_element_by_id("profile-nav-item")
            return True
        except Exception:
            # Element not found means a logged-out view.  Narrowed from a
            # bare ``except:`` so KeyboardInterrupt/SystemExit propagate.
            return False
@@ -0,0 +1,145 @@
1
+ import requests
2
+ from lxml import html
3
+ from selenium import webdriver
4
+ from selenium.webdriver.common.by import By
5
+ from selenium.webdriver.support.ui import WebDriverWait
6
+ from selenium.webdriver.support import expected_conditions as EC
7
+ from .functions import time_divide
8
+ from .objects import Experience, Education, Scraper
9
+ import os
10
+
11
class Person(Scraper):
    """Scrapes a LinkedIn member profile: name, experiences and educations."""

    name = None
    experiences = []
    educations = []
    also_viewed_urls = []
    linkedin_url = None

    def __init__(self, linkedin_url=None, experiences=None, educations=None,
                 driver=None, scrape=True):
        """Open *linkedin_url* in a Chrome session and optionally scrape it.

        ``experiences``/``educations`` previously used mutable default
        arguments, so results accumulated across every Person created with
        the defaults; they now default to fresh lists.
        """
        self.linkedin_url = linkedin_url
        self.experiences = [] if experiences is None else experiences
        self.educations = [] if educations is None else educations

        if driver is None:
            try:
                # CHROMEDRIVER env var wins; otherwise try the bundled binary.
                driver_path = os.getenv("CHROMEDRIVER")
                if driver_path is None:
                    driver_path = os.path.join(os.path.dirname(__file__), 'drivers/chromedriver')
                driver = webdriver.Chrome(driver_path)
            except Exception:
                # Fall back to whatever chromedriver is on PATH.
                driver = webdriver.Chrome()

        driver.get(linkedin_url)
        self.driver = driver

        if scrape:
            self.scrape()

    def add_experience(self, experience):
        self.experiences.append(experience)

    def add_education(self, education):
        self.educations.append(education)

    def scrape(self, close_on_complete=True):
        """Scrape the already-loaded profile, closing the driver when done."""
        if self.is_signed_in():
            self.scrape_logged_in(close_on_complete=close_on_complete)
        else:
            self.scrape_not_logged_in(close_on_complete=close_on_complete)

    def scrape_logged_in(self, close_on_complete=True):
        driver = self.driver
        self.name = driver.find_element_by_class_name("pv-top-card-section__name").text

        # Scroll so the lazily-loaded experience section renders.
        driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight/2));")
        WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, "experience-section")))

        # Work experience.
        exp = driver.find_element_by_id("experience-section")
        for position in exp.find_elements_by_class_name("pv-position-entity"):
            position_title = position.find_element_by_tag_name("h3").text
            company = position.find_element_by_class_name("pv-entity__secondary-title").text

            try:
                times = position.find_element_by_class_name("pv-entity__date-range").text
                from_date, to_date, duration = time_divide(times)
            except Exception:
                # Date range missing or unparsable; keep the entry anyway.
                from_date, to_date = (None, None)
            experience = Experience(position_title=position_title, from_date=from_date, to_date=to_date)
            experience.institution_name = company
            self.add_experience(experience)

        # Scroll further for the education section.
        driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight/1.5));")
        WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, "education-section")))

        # Education.
        edu = driver.find_element_by_id("education-section")
        for school in edu.find_elements_by_class_name("pv-profile-section__sortable-item"):
            university = school.find_element_by_class_name("pv-entity__school-name").text
            degree = school.find_element_by_class_name("pv-entity__degree-name").text
            try:
                times = school.find_element_by_class_name("pv-entity__dates").text
                from_date, to_date, duration = time_divide(times)
            except Exception:
                from_date, to_date = (None, None)
            education = Education(from_date=from_date, to_date=to_date, degree=degree)
            education.institution_name = university
            self.add_education(education)

        if close_on_complete:
            driver.close()

    def scrape_not_logged_in(self, close_on_complete=True, retry_limit=10):
        driver = self.driver
        retry_times = 0
        # Reload until the page serves the public (logged-out) layout.
        while self.is_signed_in() and retry_times <= retry_limit:
            driver.get(self.linkedin_url)
            retry_times = retry_times + 1

        # get name
        self.name = driver.find_element_by_id("name").text

        # get experience
        exp = driver.find_element_by_id("experience")
        for position in exp.find_elements_by_class_name("position"):
            position_title = position.find_element_by_class_name("item-title").text
            company = position.find_element_by_class_name("item-subtitle").text

            try:
                times = position.find_element_by_class_name("date-range").text
                from_date, to_date, duration = time_divide(times)
            except Exception:
                from_date, to_date = (None, None)
            experience = Experience(position_title=position_title, from_date=from_date, to_date=to_date)
            experience.institution_name = company
            self.add_experience(experience)

        # get education
        edu = driver.find_element_by_id("education")
        for school in edu.find_elements_by_class_name("school"):
            university = school.find_element_by_class_name("item-title").text
            degree = school.find_element_by_class_name("original").text
            try:
                times = school.find_element_by_class_name("date-range").text
                from_date, to_date, duration = time_divide(times)
            except Exception:
                from_date, to_date = (None, None)
            education = Education(from_date=from_date, to_date=to_date, degree=degree)
            education.institution_name = university
            self.add_education(education)

        if close_on_complete:
            driver.close()

    def __repr__(self):
        return "{name}\n\nExperience\n{exp}\n\nEducation\n{edu}".format(name=self.name, exp=self.experiences, edu=self.educations)
144
+
145
+
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/python3
2
+
3
+ import requests
4
+ from lxml import html
5
+ from selenium import webdriver
6
+ import re
7
+ import os
8
+
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 1.1
2
+ Name: linkedin-scraper
3
+ Version: 2.1.0
4
+ Summary: Scrapes user data from Linkedin
5
+ Home-page: https://github.com/joeyism/linkedin_scraper
6
+ Author: Joey Sham
7
+ Author-email: sham.joey@gmail.com
8
+ License: UNKNOWN
9
+ Download-URL: https://github.com/joeyism/linkedin_scraper/dist/2.1.0.tar.gz
10
+ Description-Content-Type: UNKNOWN
11
+ Description: UNKNOWN
12
+ Keywords: linkedin,scraping,scraper
13
+ Platform: UNKNOWN
@@ -0,0 +1,14 @@
1
+ README.rst
2
+ setup.cfg
3
+ setup.py
4
+ linkedin_scraper/__init__.py
5
+ linkedin_scraper/company.py
6
+ linkedin_scraper/functions.py
7
+ linkedin_scraper/objects.py
8
+ linkedin_scraper/person.py
9
+ linkedin_scraper/scraper.py
10
+ linkedin_scraper.egg-info/PKG-INFO
11
+ linkedin_scraper.egg-info/SOURCES.txt
12
+ linkedin_scraper.egg-info/dependency_links.txt
13
+ linkedin_scraper.egg-info/requires.txt
14
+ linkedin_scraper.egg-info/top_level.txt
@@ -0,0 +1,8 @@
1
from os.path import dirname, basename, isfile
from .person import Person
from .objects import Institution, Experience, Education
from .company import Company

import glob

# Advertise every sibling module (sans the ".py" suffix) in __all__,
# skipping the package initialiser itself.
modules = glob.glob(dirname(__file__) + "/*.py")
__all__ = [
    basename(module_path)[:-3]
    for module_path in modules
    if isfile(module_path) and not module_path.endswith('__init__.py')
]
@@ -0,0 +1,159 @@
1
+ import requests
2
+ from lxml import html
3
+ from selenium import webdriver
4
+ from selenium.webdriver.common.by import By
5
+ from selenium.webdriver.support.ui import WebDriverWait
6
+ from selenium.webdriver.support import expected_conditions as EC
7
+ import os
8
+
9
class CompanySummary(object):
    """Lightweight (url, name, follower-count) record for a related company."""

    linkedin_url = None
    name = None
    followers = None

    def __init__(self, linkedin_url = None, name = None, followers = None):
        self.linkedin_url = linkedin_url
        self.name = name
        self.followers = followers

    def __repr__(self):
        # Identity check against the None singleton (was ``== None``).
        if self.followers is None:
            return """ {name} """.format(name = self.name)
        else:
            return """ {name} {followers} """.format(name = self.name, followers = self.followers)
24
+
25
class Company(object):
    """Scrapes a public (logged-out) LinkedIn company page."""

    linkedin_url = None
    name = None
    about_us = None
    website = None
    headquarters = None
    founded = None
    company_type = None
    company_size = None
    specialties = None
    industry = None  # set by scrape(); declared so __repr__ cannot AttributeError
    showcase_pages = []
    affiliated_companies = []
    driver = None

    def __init__(self, linkedin_url=None, name=None, about_us=None, website=None,
                 headquarters=None, founded=None, company_type=None,
                 company_size=None, specialties=None, showcase_pages=None,
                 affiliated_companies=None, driver=None, scrape=True):
        """Open *linkedin_url* in a Chrome session and optionally scrape it.

        ``showcase_pages``/``affiliated_companies`` previously used mutable
        default arguments, so every Company instance created with the
        defaults shared the same two lists; they now default to fresh lists.
        """
        self.linkedin_url = linkedin_url
        self.name = name
        self.about_us = about_us
        self.website = website
        self.headquarters = headquarters
        self.founded = founded
        self.company_type = company_type
        self.company_size = company_size
        self.specialties = specialties
        self.showcase_pages = [] if showcase_pages is None else showcase_pages
        self.affiliated_companies = [] if affiliated_companies is None else affiliated_companies

        if driver is None:
            try:
                # CHROMEDRIVER env var wins; otherwise try the bundled binary.
                driver_path = os.getenv("CHROMEDRIVER")
                if driver_path is None:
                    driver_path = os.path.join(os.path.dirname(__file__), 'drivers/chromedriver')
                driver = webdriver.Chrome(driver_path)
            except Exception:
                # Fall back to whatever chromedriver is on PATH.
                driver = webdriver.Chrome()

        driver.get(linkedin_url)
        self.driver = driver

        if scrape:
            self.scrape()

    def __get_text_under_subtitle(self, elem):
        # Drop the first line (the subtitle itself); keep the rest verbatim.
        return "\n".join(elem.text.split("\n")[1:])

    def __get_text_under_subtitle_by_class(self, driver, class_name):
        return self.__get_text_under_subtitle(driver.find_element_by_class_name(class_name))

    def scrape(self, close_on_complete=True):
        """Reload the page and scrape it, closing the driver when done."""
        driver = self.driver
        driver.get(self.linkedin_url)

        self.name = driver.find_element_by_class_name("name").text

        self.about_us = driver.find_element_by_class_name("basic-info-description").text
        self.specialties = self.__get_text_under_subtitle_by_class(driver, "specialties")
        self.website = self.__get_text_under_subtitle_by_class(driver, "website")
        self.headquarters = driver.find_element_by_class_name("adr").text
        self.industry = driver.find_element_by_class_name("industry").text
        self.company_size = driver.find_element_by_class_name("company-size").text
        self.company_type = self.__get_text_under_subtitle_by_class(driver, "type")
        self.founded = self.__get_text_under_subtitle_by_class(driver, "founded")

        # Showcase pages live behind a dialog: open it, read it, close it.
        try:
            driver.find_element_by_id("view-other-showcase-pages-dialog").click()
            WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, 'dialog')))

            showcase_pages = driver.find_elements_by_class_name("company-showcase-pages")[1]
            for showcase_company in showcase_pages.find_elements_by_tag_name("li"):
                name_elem = showcase_company.find_element_by_class_name("name")
                companySummary = CompanySummary(
                    linkedin_url=name_elem.find_element_by_tag_name("a").get_attribute("href"),
                    name=name_elem.text,
                    followers=showcase_company.text.split("\n")[1],
                )
                self.showcase_pages.append(companySummary)
            driver.find_element_by_class_name("dialog-close").click()
        except Exception:
            # Showcase module is optional; missing elements leave the list empty.
            pass

        # Affiliated-companies carousel: advance it every third entry.
        try:
            affiliated_pages = driver.find_element_by_class_name("affiliated-companies")
            for i, affiliated_page in enumerate(affiliated_pages.find_elements_by_class_name("affiliated-company-name")):
                if i % 3 == 0:
                    affiliated_pages.find_element_by_class_name("carousel-control-next").click()

                companySummary = CompanySummary(
                    linkedin_url=affiliated_page.find_element_by_tag_name("a").get_attribute("href"),
                    name=affiliated_page.text,
                )
                self.affiliated_companies.append(companySummary)
        except Exception:
            pass

        if close_on_complete:
            driver.close()

    def __repr__(self):
        return """
        {name}

        {about_us}

        Specialties: {specialties}

        Website: {website}
        Industry: {industry}
        Type: {company_type}
        Headquarters: {headquarters}
        Company Size: {company_size}
        Founded: {founded}

        Showcase Pages
        {showcase_pages}

        Affiliated Companies
        {affiliated_companies}
        """.format(
            name=self.name,
            about_us=self.about_us,
            specialties=self.specialties,
            website=self.website,
            industry=self.industry,
            company_type=self.company_type,
            headquarters=self.headquarters,
            company_size=self.company_size,
            founded=self.founded,
            showcase_pages=self.showcase_pages,
            affiliated_companies=self.affiliated_companies,
        )
@@ -0,0 +1,15 @@
1
+ import re
2
+
3
def time_divide(string):
    """Split a LinkedIn date-range string into ``(from, to, duration)``.

    e.g. ``"Jan 2010 – Dec 2012 (3 yrs)"`` -> ``("Jan 2010", "Dec 2012", "3 yrs")``.
    The parenthesised duration is optional ("" is returned when absent).
    The range separator is an en-dash, as rendered by LinkedIn.
    """
    # Raw string: "\(" in a plain literal is an invalid escape sequence.
    duration = re.search(r"\((.*?)\)", string)

    if duration is not None:
        duration = duration.group(0)
        string = string.replace(duration, "").strip()
    else:
        duration = "()"

    times = string.split("–")
    if len(times) < 2:
        # No en-dash separator: the original raised IndexError here; treat
        # the whole string as the start date instead.
        return (times[0].strip(), "", duration[1:-1])
    return (times[0].strip(), times[1].strip(), duration[1:-1])
14
+
15
+
@@ -0,0 +1,51 @@
1
+
2
class Institution(object):
    """Base record for a company or school scraped from LinkedIn.

    Subclass ``__repr__`` implementations format ``institution_name``; the
    original constructor only assigned ``self.name``, leaving
    ``institution_name`` at the class default ``None``.  The constructor now
    populates both (backward compatible: ``name`` is still set).
    """

    institution_name = None
    website = None
    industry = None
    type = None
    headquarters = None
    company_size = None
    founded = None

    def __init__(self, name=None, website=None, industry=None, type=None,
                 headquarters=None, company_size=None, founded=None):
        self.name = name
        self.institution_name = name  # keep subclass __repr__ consistent
        self.website = website
        self.industry = industry
        self.type = type
        self.headquarters = headquarters
        self.company_size = company_size
        self.founded = founded
19
+
20
class Experience(Institution):
    """A single work-experience entry on a profile."""

    from_date = None
    to_date = None
    description = None
    position_title = None

    def __init__(self, from_date=None, to_date=None, description=None, position_title=None):
        # Deliberately does not call Institution.__init__: an experience
        # entry only carries its own fields; callers assign
        # institution_name directly after construction.
        self.position_title = position_title
        self.description = description
        self.to_date = to_date
        self.from_date = from_date

    def __repr__(self):
        fields = {
            "position_title": self.position_title,
            "company": self.institution_name,
            "from_date": self.from_date,
            "to_date": self.to_date,
        }
        return "{position_title} at {company} from {from_date} to {to_date}".format(**fields)
34
+
35
+
36
class Education(Institution):
    """A single education entry on a profile."""

    from_date = None
    to_date = None
    description = None
    degree = None

    def __init__(self, from_date=None, to_date=None, description=None, degree=None):
        # Deliberately does not call Institution.__init__: an education
        # entry only carries its own fields; callers assign
        # institution_name directly after construction.
        self.degree = degree
        self.description = description
        self.to_date = to_date
        self.from_date = from_date

    def __repr__(self):
        fields = {
            "degree": self.degree,
            "company": self.institution_name,
            "from_date": self.from_date,
            "to_date": self.to_date,
        }
        return "{degree} at {company} from {from_date} to {to_date}".format(**fields)
50
+
51
+
@@ -0,0 +1,87 @@
1
import os

import requests
from lxml import html
from selenium import webdriver

from .functions import time_divide
# BUG FIX: Person.scrape references Experience and Education but this module
# never imported them, so scraping raised NameError at runtime.
from .objects import Experience, Education
6
+
7
class Person(object):
    """Scrapes a public LinkedIn member profile: name, experiences, educations."""

    name = None
    experiences = []
    educations = []
    also_viewed_urls = []
    linkedin_url = None
    driver = None

    def __init__(self, linkedin_url=None, experiences=None, educations=None,
                 driver=None, scrape=True):
        """Open *linkedin_url* in a Chrome session and optionally scrape it.

        ``experiences``/``educations`` previously used mutable default
        arguments, so results accumulated across every Person created with
        the defaults; they now default to fresh lists.
        """
        self.linkedin_url = linkedin_url
        self.experiences = [] if experiences is None else experiences
        self.educations = [] if educations is None else educations

        if driver is None:
            try:
                # CHROMEDRIVER env var wins; otherwise try the bundled binary.
                driver_path = os.getenv("CHROMEDRIVER")
                if driver_path is None:
                    driver_path = os.path.join(os.path.dirname(__file__), 'drivers/chromedriver')
                driver = webdriver.Chrome(driver_path)
            except Exception:
                # Fall back to whatever chromedriver is on PATH.
                driver = webdriver.Chrome()

        driver.get(linkedin_url)
        self.driver = driver

        if scrape:
            self.scrape()

    def add_experience(self, experience):
        self.experiences.append(experience)

    def add_education(self, education):
        self.educations.append(education)

    def scrape(self, close_on_complete=True):
        """Reload the profile and scrape it, closing the driver when done."""
        driver = self.driver
        driver.get(self.linkedin_url)

        # get name
        self.name = driver.find_element_by_id("name").text

        # get experience
        exp = driver.find_element_by_id("experience")
        for position in exp.find_elements_by_class_name("position"):
            position_title = position.find_element_by_class_name("item-title").text
            company = position.find_element_by_class_name("item-subtitle").text

            try:
                times = position.find_element_by_class_name("date-range").text
                from_date, to_date, duration = time_divide(times)
            except Exception:
                # Date range missing or unparsable; keep the entry anyway.
                from_date, to_date = (None, None)
            experience = Experience(position_title=position_title, from_date=from_date, to_date=to_date)
            experience.institution_name = company
            self.add_experience(experience)

        # get education
        edu = driver.find_element_by_id("education")
        for school in edu.find_elements_by_class_name("school"):
            university = school.find_element_by_class_name("item-title").text
            degree = school.find_element_by_class_name("original").text
            try:
                times = school.find_element_by_class_name("date-range").text
                from_date, to_date, duration = time_divide(times)
            except Exception:
                from_date, to_date = (None, None)
            education = Education(from_date=from_date, to_date=to_date, degree=degree)
            education.institution_name = university
            self.add_education(education)

        if close_on_complete:
            driver.close()

    def __repr__(self):
        return "{name}\n\nExperience\n{exp}\n\nEducation\n{edu}".format(name=self.name, exp=self.experiences, edu=self.educations)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/python3
2
+
3
+ import requests
4
+ from lxml import html
5
+ from selenium import webdriver
6
+ import re
7
+ import os
8
+