linkedin-scraper 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- usr/lib/python3.6/site-packages/linkedin_scraper/__init__.py +10 -0
- usr/lib/python3.6/site-packages/linkedin_scraper/__pycache__/__init__.cpython-36.pyc +0 -0
- usr/lib/python3.6/site-packages/linkedin_scraper/__pycache__/company.cpython-36.pyc +0 -0
- usr/lib/python3.6/site-packages/linkedin_scraper/__pycache__/functions.cpython-36.pyc +0 -0
- usr/lib/python3.6/site-packages/linkedin_scraper/__pycache__/objects.cpython-36.pyc +0 -0
- usr/lib/python3.6/site-packages/linkedin_scraper/__pycache__/person.cpython-36.pyc +0 -0
- usr/lib/python3.6/site-packages/linkedin_scraper/__pycache__/scraper.cpython-36.pyc +0 -0
- usr/lib/python3.6/site-packages/linkedin_scraper/company.py +213 -0
- usr/lib/python3.6/site-packages/linkedin_scraper/functions.py +15 -0
- usr/lib/python3.6/site-packages/linkedin_scraper/objects.py +60 -0
- usr/lib/python3.6/site-packages/linkedin_scraper/person.py +145 -0
- usr/lib/python3.6/site-packages/linkedin_scraper/scraper.py +8 -0
- usr/lib/python3.6/site-packages/linkedin_scraper-2.1.0-py3.6.egg-info/PKG-INFO +13 -0
- usr/lib/python3.6/site-packages/linkedin_scraper-2.1.0-py3.6.egg-info/SOURCES.txt +14 -0
- usr/lib/python3.6/site-packages/linkedin_scraper-2.1.0-py3.6.egg-info/dependency_links.txt +1 -0
- usr/lib/python3.6/site-packages/linkedin_scraper-2.1.0-py3.6.egg-info/requires.txt +3 -0
- usr/lib/python3.6/site-packages/linkedin_scraper-2.1.0-py3.6.egg-info/top_level.txt +1 -0
- usr/lib/python3.6/site-packages/linkedin_user_scraper/__init__.py +8 -0
- usr/lib/python3.6/site-packages/linkedin_user_scraper/__pycache__/__init__.cpython-36.pyc +0 -0
- usr/lib/python3.6/site-packages/linkedin_user_scraper/__pycache__/company.cpython-36.pyc +0 -0
- usr/lib/python3.6/site-packages/linkedin_user_scraper/__pycache__/functions.cpython-36.pyc +0 -0
- usr/lib/python3.6/site-packages/linkedin_user_scraper/__pycache__/objects.cpython-36.pyc +0 -0
- usr/lib/python3.6/site-packages/linkedin_user_scraper/__pycache__/person.cpython-36.pyc +0 -0
- usr/lib/python3.6/site-packages/linkedin_user_scraper/__pycache__/scraper.cpython-36.pyc +0 -0
- usr/lib/python3.6/site-packages/linkedin_user_scraper/company.py +159 -0
- usr/lib/python3.6/site-packages/linkedin_user_scraper/functions.py +15 -0
- usr/lib/python3.6/site-packages/linkedin_user_scraper/objects.py +51 -0
- usr/lib/python3.6/site-packages/linkedin_user_scraper/person.py +87 -0
- usr/lib/python3.6/site-packages/linkedin_user_scraper/scraper.py +8 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import glob
from os.path import dirname, basename, isfile

from .person import Person
from .objects import Institution, Experience, Education
from .company import Company

__version__ = "2.1.0"

# Expose every sibling module of this package (except the initializer
# itself) through `from linkedin_scraper import *`.
modules = glob.glob(dirname(__file__) + "/*.py")
__all__ = [
    basename(path)[:-3]
    for path in modules
    if isfile(path) and not path.endswith("__init__.py")
]
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
from lxml import html
|
|
3
|
+
from selenium import webdriver
|
|
4
|
+
from selenium.webdriver.common.by import By
|
|
5
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
6
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
7
|
+
from .objects import Scraper
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
class CompanySummary(object):
    """Lightweight record describing a related company: its name, profile
    URL, and (optionally) a follower-count string."""

    linkedin_url = None
    name = None
    followers = None

    def __init__(self, linkedin_url=None, name=None, followers=None):
        self.linkedin_url = linkedin_url
        self.name = name
        self.followers = followers

    def __repr__(self):
        # The follower count is optional; leave it out when it was never set.
        if self.followers is None:
            return " {name} ".format(name=self.name)
        return " {name} {followers} ".format(name=self.name, followers=self.followers)
|
|
25
|
+
|
|
26
|
+
class Company(Scraper):
    """Scrapes a LinkedIn company page into plain attributes.

    If no ``driver`` is supplied, a Chrome driver is created (path taken
    from the ``CHROMEDRIVER`` env var, else the bundled binary, else PATH).
    The constructor navigates to ``linkedin_url`` and, when ``scrape`` is
    true, immediately scrapes the page.
    """

    linkedin_url = None
    name = None
    about_us = None
    website = None
    headquarters = None
    founded = None
    company_type = None
    company_size = None
    specialties = None
    showcase_pages = []
    affiliated_companies = []

    def __init__(self, linkedin_url=None, name=None, about_us=None, website=None,
                 headquarters=None, founded=None, company_type=None, company_size=None,
                 specialties=None, showcase_pages=None, affiliated_companies=None,
                 driver=None, scrape=True):
        self.linkedin_url = linkedin_url
        self.name = name
        self.about_us = about_us
        self.website = website
        self.headquarters = headquarters
        self.founded = founded
        self.company_type = company_type
        self.company_size = company_size
        self.specialties = specialties
        # Fix: the original used mutable default arguments ([]), so every
        # Company instance silently shared the same two lists.
        self.showcase_pages = [] if showcase_pages is None else showcase_pages
        self.affiliated_companies = [] if affiliated_companies is None else affiliated_companies

        if driver is None:
            try:
                driver_path = os.getenv("CHROMEDRIVER")
                if driver_path is None:
                    # Fall back to the chromedriver bundled with the package.
                    driver_path = os.path.join(os.path.dirname(__file__), 'drivers/chromedriver')
                driver = webdriver.Chrome(driver_path)
            except Exception:
                # Last resort: let Selenium locate chromedriver on PATH.
                driver = webdriver.Chrome()

        driver.get(linkedin_url)
        self.driver = driver

        if scrape:
            self.scrape()

    def __get_text_under_subtitle(self, elem):
        """Return the element's text with its first (subtitle) line removed."""
        return "\n".join(elem.text.split("\n")[1:])

    def __get_text_under_subtitle_by_class(self, driver, class_name):
        return self.__get_text_under_subtitle(driver.find_element_by_class_name(class_name))

    def scrape(self, close_on_complete=True):
        """Dispatch to the logged-in or anonymous scraping flow."""
        if self.is_signed_in():
            self.scrape_logged_in(close_on_complete=close_on_complete)
        else:
            self.scrape_not_logged_in(close_on_complete=close_on_complete)

    def scrape_logged_in(self, close_on_complete=True):
        """Scrape the company page as rendered for an authenticated session."""
        driver = self.driver

        self.name = driver.find_element_by_xpath('//h1[@dir="ltr"]').text
        self.about_us = driver.find_element_by_class_name("org-about-us-organization-description__text").text

        self.specialties = "\n".join(driver.find_element_by_class_name("org-about-company-module__specialities").text.split(", "))
        self.website = driver.find_element_by_class_name("org-about-us-company-module__website").text
        self.headquarters = driver.find_element_by_class_name("org-about-company-module__headquarters").text
        self.industry = driver.find_element_by_class_name("company-industries").text
        self.company_size = driver.find_element_by_class_name("org-about-company-module__company-staff-count-range").text

        # Related-companies module lazy-loads; scroll it into view first.
        driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight/2));")

        try:
            _ = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, 'company-list')))
            showcase, affiliated = driver.find_elements_by_class_name("company-list")
            driver.find_element_by_id("org-related-companies-module__show-more-btn").click()

            # get showcase
            for showcase_company in showcase.find_elements_by_class_name("org-company-card"):
                companySummary = CompanySummary(
                    linkedin_url=showcase_company.find_element_by_class_name("company-name-link").get_attribute("href"),
                    name=showcase_company.find_element_by_class_name("company-name-link").text,
                    followers=showcase_company.find_element_by_class_name("company-followers-count").text
                )
                self.showcase_pages.append(companySummary)

            # affiliated companies — fix: the original iterated the *showcase*
            # list again here, so affiliated_companies was filled with
            # duplicates of the showcase pages.
            for affiliated_company in affiliated.find_elements_by_class_name("org-company-card"):
                companySummary = CompanySummary(
                    linkedin_url=affiliated_company.find_element_by_class_name("company-name-link").get_attribute("href"),
                    name=affiliated_company.find_element_by_class_name("company-name-link").text,
                    followers=affiliated_company.find_element_by_class_name("company-followers-count").text
                )
                self.affiliated_companies.append(companySummary)

        except Exception:
            # Related-companies module is optional; skip pages without it.
            pass

        if close_on_complete:
            driver.close()

    def scrape_not_logged_in(self, close_on_complete=True, retry_limit=10):
        """Scrape the public (anonymous) company page, reloading until we are
        served the logged-out view or the retry budget runs out."""
        driver = self.driver
        retry_times = 0
        while self.is_signed_in() and retry_times <= retry_limit:
            page = driver.get(self.linkedin_url)
            retry_times = retry_times + 1

        self.name = driver.find_element_by_class_name("name").text

        self.about_us = driver.find_element_by_class_name("basic-info-description").text
        self.specialties = self.__get_text_under_subtitle_by_class(driver, "specialties")
        self.website = self.__get_text_under_subtitle_by_class(driver, "website")
        self.headquarters = driver.find_element_by_class_name("adr").text
        self.industry = driver.find_element_by_class_name("industry").text
        self.company_size = driver.find_element_by_class_name("company-size").text
        self.company_type = self.__get_text_under_subtitle_by_class(driver, "type")
        self.founded = self.__get_text_under_subtitle_by_class(driver, "founded")

        # get showcase
        try:
            driver.find_element_by_id("view-other-showcase-pages-dialog").click()
            WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, 'dialog')))

            showcase_pages = driver.find_elements_by_class_name("company-showcase-pages")[1]
            for showcase_company in showcase_pages.find_elements_by_tag_name("li"):
                name_elem = showcase_company.find_element_by_class_name("name")
                companySummary = CompanySummary(
                    linkedin_url=name_elem.find_element_by_tag_name("a").get_attribute("href"),
                    name=name_elem.text,
                    followers=showcase_company.text.split("\n")[1]
                )
                self.showcase_pages.append(companySummary)
            driver.find_element_by_class_name("dialog-close").click()
        except Exception:
            # Showcase dialog is optional; skip quietly when absent.
            pass

        # affiliated company
        try:
            affiliated_pages = driver.find_element_by_class_name("affiliated-companies")
            for i, affiliated_page in enumerate(affiliated_pages.find_elements_by_class_name("affiliated-company-name")):
                # Advance the carousel every three entries to reveal the next page.
                if i % 3 == 0:
                    affiliated_pages.find_element_by_class_name("carousel-control-next").click()

                companySummary = CompanySummary(
                    linkedin_url=affiliated_page.find_element_by_tag_name("a").get_attribute("href"),
                    name=affiliated_page.text
                )
                self.affiliated_companies.append(companySummary)
        except Exception:
            pass

        if close_on_complete:
            driver.close()

    def __repr__(self):
        return """
{name}

{about_us}

Specialties: {specialties}

Website: {website}
Industry: {industry}
Type: {company_type}
Headquarters: {headquarters}
Company Size: {company_size}
Founded: {founded}

Showcase Pages
{showcase_pages}

Affiliated Companies
{affiliated_companies}
""".format(
            name=self.name,
            about_us=self.about_us,
            specialties=self.specialties,
            website=self.website,
            industry=self.industry,
            company_type=self.company_type,
            headquarters=self.headquarters,
            company_size=self.company_size,
            founded=self.founded,
            showcase_pages=self.showcase_pages,
            affiliated_companies=self.affiliated_companies
        )
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import re

# Matches a parenthesized duration such as "(2 yrs 3 mos)".
_DURATION_RE = re.compile(r"\((.*?)\)")


def time_divide(string):
    """Split a LinkedIn date-range string into ``(from, to, duration)``.

    Example: ``"Jan 2010 – Dec 2012 (3 yrs)"`` ->
    ``("Jan 2010", "Dec 2012", "3 yrs")``.  When no parenthesized duration
    is present the third element is ``""``.  When the string contains no
    en dash the whole remainder is treated as the start date and the end
    date is ``""`` (the original raised IndexError in that case).
    """
    # Fix: the original regex was a non-raw string ("\(...\)"), which relies
    # on Python passing unknown escapes through and triggers a
    # DeprecationWarning on modern interpreters.
    match = _DURATION_RE.search(string)
    if match is not None:
        duration = match.group(1)
        string = string.replace(match.group(0), "").strip()
    else:
        duration = ""

    # LinkedIn separates the two dates with an en dash, not a hyphen.
    times = string.split("–")
    from_date = times[0].strip()
    to_date = times[1].strip() if len(times) > 1 else ""
    return (from_date, to_date, duration)
|
|
14
|
+
|
|
15
|
+
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
|
|
2
|
+
class Institution(object):
    """Base record for a LinkedIn institution (a company or a school)."""

    # NOTE(review): the class attribute is ``institution_name`` (and the
    # subclass __repr__ methods read it), but __init__ below stores the
    # ``name`` argument as ``self.name`` — looks like a naming mismatch;
    # confirm before relying on either attribute being populated.
    institution_name = None
    website = None
    industry = None
    type = None
    headquarters = None
    company_size = None
    founded = None

    def __init__(self, name=None, website=None, industry=None, type=None, headquarters=None, company_size=None, founded=None):
        self.name = name
        self.website = website
        self.industry = industry
        self.type = type
        self.headquarters = headquarters
        self.company_size = company_size
        self.founded = founded
|
|
19
|
+
|
|
20
|
+
class Experience(Institution):
    """A single work-experience entry on a profile."""

    from_date = None
    to_date = None
    description = None
    position_title = None

    def __init__(self, from_date=None, to_date=None, description=None, position_title=None):
        # Institution.__init__ is deliberately not invoked, matching the
        # rest of this module; institution_name is assigned by the caller.
        self.from_date = from_date
        self.to_date = to_date
        self.description = description
        self.position_title = position_title

    def __repr__(self):
        return "{position_title} at {company} from {from_date} to {to_date}".format(
            position_title=self.position_title,
            company=self.institution_name,
            from_date=self.from_date,
            to_date=self.to_date,
        )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Education(Institution):
    """A single education entry on a profile."""

    from_date = None
    to_date = None
    description = None
    degree = None

    def __init__(self, from_date=None, to_date=None, description=None, degree=None):
        # Institution.__init__ is deliberately not invoked, matching the
        # rest of this module; institution_name is assigned by the caller.
        self.from_date = from_date
        self.to_date = to_date
        self.description = description
        self.degree = degree

    def __repr__(self):
        return "{degree} at {company} from {from_date} to {to_date}".format(
            degree=self.degree,
            company=self.institution_name,
            from_date=self.from_date,
            to_date=self.to_date,
        )
|
|
50
|
+
|
|
51
|
+
class Scraper(object):
    """Mixin that holds the shared Selenium driver and login detection."""

    driver = None

    def is_signed_in(self):
        """Return True when the loaded page shows the logged-in profile nav.

        The "profile-nav-item" element only appears for authenticated
        sessions, so any lookup failure is treated as an anonymous view.
        """
        try:
            self.driver.find_element_by_id("profile-nav-item")
            return True
        except Exception:
            # Fix: the original used a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit.  Any ordinary failure (element
            # missing, driver gone) still means "not signed in".
            return False
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
from lxml import html
|
|
3
|
+
from selenium import webdriver
|
|
4
|
+
from selenium.webdriver.common.by import By
|
|
5
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
6
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
7
|
+
from .functions import time_divide
|
|
8
|
+
from .objects import Experience, Education, Scraper
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
class Person(Scraper):
    """Scrapes a LinkedIn member profile into name/experiences/educations.

    If no ``driver`` is supplied, a Chrome driver is created (path taken
    from the ``CHROMEDRIVER`` env var, else the bundled binary, else PATH).
    The constructor navigates to ``linkedin_url`` and, when ``scrape`` is
    true, immediately scrapes the profile.
    """

    name = None
    experiences = []
    educations = []
    also_viewed_urls = []
    linkedin_url = None

    def __init__(self, linkedin_url=None, experiences=None, educations=None, driver=None, scrape=True):
        self.linkedin_url = linkedin_url
        # Fix: the original used mutable default arguments ([]), so every
        # Person instance silently shared the same experience/education lists.
        self.experiences = [] if experiences is None else experiences
        self.educations = [] if educations is None else educations

        if driver is None:
            try:
                driver_path = os.getenv("CHROMEDRIVER")
                if driver_path is None:
                    # Fall back to the chromedriver bundled with the package.
                    driver_path = os.path.join(os.path.dirname(__file__), 'drivers/chromedriver')
                driver = webdriver.Chrome(driver_path)
            except Exception:
                # Last resort: let Selenium locate chromedriver on PATH.
                driver = webdriver.Chrome()

        driver.get(linkedin_url)
        self.driver = driver

        if scrape:
            self.scrape()

    def add_experience(self, experience):
        self.experiences.append(experience)

    def add_education(self, education):
        self.educations.append(education)

    def scrape(self, close_on_complete=True):
        """Dispatch to the logged-in or anonymous scraping flow."""
        if self.is_signed_in():
            self.scrape_logged_in(close_on_complete=close_on_complete)
        else:
            self.scrape_not_logged_in(close_on_complete=close_on_complete)

    def scrape_logged_in(self, close_on_complete=True):
        """Scrape the profile as rendered for an authenticated session."""
        driver = self.driver
        self.name = driver.find_element_by_class_name("pv-top-card-section__name").text

        # Sections lazy-load; scroll them into view before waiting on them.
        driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight/2));")

        _ = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, "experience-section")))

        # get experience
        exp = driver.find_element_by_id("experience-section")
        for position in exp.find_elements_by_class_name("pv-position-entity"):
            position_title = position.find_element_by_tag_name("h3").text
            company = position.find_element_by_class_name("pv-entity__secondary-title").text

            try:
                times = position.find_element_by_class_name("pv-entity__date-range").text
                from_date, to_date, duration = time_divide(times)
            except Exception:
                # Date range can be missing or unparseable on some entries.
                from_date, to_date = (None, None)
            experience = Experience(position_title=position_title, from_date=from_date, to_date=to_date)
            experience.institution_name = company
            self.add_experience(experience)

        driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight/1.5));")

        _ = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, "education-section")))

        # get education
        edu = driver.find_element_by_id("education-section")
        for school in edu.find_elements_by_class_name("pv-profile-section__sortable-item"):
            university = school.find_element_by_class_name("pv-entity__school-name").text
            degree = school.find_element_by_class_name("pv-entity__degree-name").text
            try:
                times = school.find_element_by_class_name("pv-entity__dates").text
                from_date, to_date, duration = time_divide(times)
            except Exception:
                from_date, to_date = (None, None)
            education = Education(from_date=from_date, to_date=to_date, degree=degree)
            education.institution_name = university
            self.add_education(education)

        if close_on_complete:
            driver.close()

    def scrape_not_logged_in(self, close_on_complete=True, retry_limit=10):
        """Scrape the public profile, reloading until we are served the
        logged-out view or the retry budget runs out."""
        driver = self.driver
        retry_times = 0
        while self.is_signed_in() and retry_times <= retry_limit:
            page = driver.get(self.linkedin_url)
            retry_times = retry_times + 1

        # get name
        self.name = driver.find_element_by_id("name").text

        # get experience
        exp = driver.find_element_by_id("experience")
        for position in exp.find_elements_by_class_name("position"):
            position_title = position.find_element_by_class_name("item-title").text
            company = position.find_element_by_class_name("item-subtitle").text

            try:
                times = position.find_element_by_class_name("date-range").text
                from_date, to_date, duration = time_divide(times)
            except Exception:
                from_date, to_date = (None, None)
            experience = Experience(position_title=position_title, from_date=from_date, to_date=to_date)
            experience.institution_name = company
            self.add_experience(experience)

        # get education
        edu = driver.find_element_by_id("education")
        for school in edu.find_elements_by_class_name("school"):
            university = school.find_element_by_class_name("item-title").text
            degree = school.find_element_by_class_name("original").text
            try:
                times = school.find_element_by_class_name("date-range").text
                from_date, to_date, duration = time_divide(times)
            except Exception:
                from_date, to_date = (None, None)
            education = Education(from_date=from_date, to_date=to_date, degree=degree)
            education.institution_name = university
            self.add_education(education)

        if close_on_complete:
            driver.close()

    def __repr__(self):
        return "{name}\n\nExperience\n{exp}\n\nEducation\n{edu}".format(name=self.name, exp=self.experiences, edu=self.educations)
|
|
144
|
+
|
|
145
|
+
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 1.1
|
|
2
|
+
Name: linkedin-scraper
|
|
3
|
+
Version: 2.1.0
|
|
4
|
+
Summary: Scrapes user data from Linkedin
|
|
5
|
+
Home-page: https://github.com/joeyism/linkedin_scraper
|
|
6
|
+
Author: Joey Sham
|
|
7
|
+
Author-email: sham.joey@gmail.com
|
|
8
|
+
License: UNKNOWN
|
|
9
|
+
Download-URL: https://github.com/joeyism/linkedin_scraper/dist/2.1.0.tar.gz
|
|
10
|
+
Description-Content-Type: UNKNOWN
|
|
11
|
+
Description: UNKNOWN
|
|
12
|
+
Keywords: linkedin,scraping,scraper
|
|
13
|
+
Platform: UNKNOWN
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
README.rst
|
|
2
|
+
setup.cfg
|
|
3
|
+
setup.py
|
|
4
|
+
linkedin_scraper/__init__.py
|
|
5
|
+
linkedin_scraper/company.py
|
|
6
|
+
linkedin_scraper/functions.py
|
|
7
|
+
linkedin_scraper/objects.py
|
|
8
|
+
linkedin_scraper/person.py
|
|
9
|
+
linkedin_scraper/scraper.py
|
|
10
|
+
linkedin_scraper.egg-info/PKG-INFO
|
|
11
|
+
linkedin_scraper.egg-info/SOURCES.txt
|
|
12
|
+
linkedin_scraper.egg-info/dependency_links.txt
|
|
13
|
+
linkedin_scraper.egg-info/requires.txt
|
|
14
|
+
linkedin_scraper.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
linkedin_scraper
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import glob
from os.path import dirname, basename, isfile

from .person import Person
from .objects import Institution, Experience, Education
from .company import Company

# Expose every sibling module of this package (except the initializer
# itself) through `from linkedin_user_scraper import *`.
modules = glob.glob(dirname(__file__) + "/*.py")
__all__ = [
    basename(path)[:-3]
    for path in modules
    if isfile(path) and not path.endswith("__init__.py")
]
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
from lxml import html
|
|
3
|
+
from selenium import webdriver
|
|
4
|
+
from selenium.webdriver.common.by import By
|
|
5
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
6
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
class CompanySummary(object):
    """Lightweight record describing a related company: its name, profile
    URL, and (optionally) a follower-count string."""

    linkedin_url = None
    name = None
    followers = None

    def __init__(self, linkedin_url=None, name=None, followers=None):
        self.linkedin_url = linkedin_url
        self.name = name
        self.followers = followers

    def __repr__(self):
        # The follower count is optional; leave it out when it was never set.
        if self.followers is None:
            return " {name} ".format(name=self.name)
        return " {name} {followers} ".format(name=self.name, followers=self.followers)
|
|
24
|
+
|
|
25
|
+
class Company(object):
    """Scrapes the public (anonymous) LinkedIn company page into attributes.

    If no ``driver`` is supplied, a Chrome driver is created (path taken
    from the ``CHROMEDRIVER`` env var, else the bundled binary, else PATH).
    The constructor navigates to ``linkedin_url`` and, when ``scrape`` is
    true, immediately scrapes the page.
    """

    linkedin_url = None
    name = None
    about_us = None
    website = None
    headquarters = None
    founded = None
    company_type = None
    company_size = None
    specialties = None
    showcase_pages = []
    affiliated_companies = []
    driver = None

    def __init__(self, linkedin_url=None, name=None, about_us=None, website=None,
                 headquarters=None, founded=None, company_type=None, company_size=None,
                 specialties=None, showcase_pages=None, affiliated_companies=None,
                 driver=None, scrape=True):
        self.linkedin_url = linkedin_url
        self.name = name
        self.about_us = about_us
        self.website = website
        self.headquarters = headquarters
        self.founded = founded
        self.company_type = company_type
        self.company_size = company_size
        self.specialties = specialties
        # Fix: the original used mutable default arguments ([]), so every
        # Company instance silently shared the same two lists.
        self.showcase_pages = [] if showcase_pages is None else showcase_pages
        self.affiliated_companies = [] if affiliated_companies is None else affiliated_companies

        if driver is None:
            try:
                driver_path = os.getenv("CHROMEDRIVER")
                if driver_path is None:
                    # Fall back to the chromedriver bundled with the package.
                    driver_path = os.path.join(os.path.dirname(__file__), 'drivers/chromedriver')
                driver = webdriver.Chrome(driver_path)
            except Exception:
                # Last resort: let Selenium locate chromedriver on PATH.
                driver = webdriver.Chrome()

        driver.get(linkedin_url)
        self.driver = driver

        if scrape:
            self.scrape()

    def __get_text_under_subtitle(self, elem):
        """Return the element's text with its first (subtitle) line removed."""
        return "\n".join(elem.text.split("\n")[1:])

    def __get_text_under_subtitle_by_class(self, driver, class_name):
        return self.__get_text_under_subtitle(driver.find_element_by_class_name(class_name))

    def scrape(self, close_on_complete=True):
        """Scrape the anonymous company page currently loaded in the driver."""
        driver = self.driver
        page = driver.get(self.linkedin_url)

        self.name = driver.find_element_by_class_name("name").text

        self.about_us = driver.find_element_by_class_name("basic-info-description").text
        self.specialties = self.__get_text_under_subtitle_by_class(driver, "specialties")
        self.website = self.__get_text_under_subtitle_by_class(driver, "website")
        self.headquarters = driver.find_element_by_class_name("adr").text
        self.industry = driver.find_element_by_class_name("industry").text
        self.company_size = driver.find_element_by_class_name("company-size").text
        self.company_type = self.__get_text_under_subtitle_by_class(driver, "type")
        self.founded = self.__get_text_under_subtitle_by_class(driver, "founded")

        # get showcase
        try:
            driver.find_element_by_id("view-other-showcase-pages-dialog").click()
            WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, 'dialog')))

            showcase_pages = driver.find_elements_by_class_name("company-showcase-pages")[1]
            for showcase_company in showcase_pages.find_elements_by_tag_name("li"):
                name_elem = showcase_company.find_element_by_class_name("name")
                companySummary = CompanySummary(
                    linkedin_url=name_elem.find_element_by_tag_name("a").get_attribute("href"),
                    name=name_elem.text,
                    followers=showcase_company.text.split("\n")[1]
                )
                self.showcase_pages.append(companySummary)
            driver.find_element_by_class_name("dialog-close").click()
        except Exception:
            # Showcase dialog is optional; skip quietly when absent.
            pass

        # affiliated company
        try:
            affiliated_pages = driver.find_element_by_class_name("affiliated-companies")
            for i, affiliated_page in enumerate(affiliated_pages.find_elements_by_class_name("affiliated-company-name")):
                # Advance the carousel every three entries to reveal the next page.
                if i % 3 == 0:
                    affiliated_pages.find_element_by_class_name("carousel-control-next").click()

                companySummary = CompanySummary(
                    linkedin_url=affiliated_page.find_element_by_tag_name("a").get_attribute("href"),
                    name=affiliated_page.text
                )
                self.affiliated_companies.append(companySummary)
        except Exception:
            pass

        if close_on_complete:
            driver.close()

    def __repr__(self):
        return """
{name}

{about_us}

Specialties: {specialties}

Website: {website}
Industry: {industry}
Type: {company_type}
Headquarters: {headquarters}
Company Size: {company_size}
Founded: {founded}

Showcase Pages
{showcase_pages}

Affiliated Companies
{affiliated_companies}
""".format(
            name=self.name,
            about_us=self.about_us,
            specialties=self.specialties,
            website=self.website,
            industry=self.industry,
            company_type=self.company_type,
            headquarters=self.headquarters,
            company_size=self.company_size,
            founded=self.founded,
            showcase_pages=self.showcase_pages,
            affiliated_companies=self.affiliated_companies
        )
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import re

# Matches a parenthesized duration such as "(2 yrs 3 mos)".
_DURATION_RE = re.compile(r"\((.*?)\)")


def time_divide(string):
    """Split a LinkedIn date-range string into ``(from, to, duration)``.

    Example: ``"Jan 2010 – Dec 2012 (3 yrs)"`` ->
    ``("Jan 2010", "Dec 2012", "3 yrs")``.  When no parenthesized duration
    is present the third element is ``""``.  When the string contains no
    en dash the whole remainder is treated as the start date and the end
    date is ``""`` (the original raised IndexError in that case).
    """
    # Fix: the original regex was a non-raw string ("\(...\)"), which
    # triggers a DeprecationWarning on modern interpreters.
    match = _DURATION_RE.search(string)
    if match is not None:
        duration = match.group(1)
        string = string.replace(match.group(0), "").strip()
    else:
        duration = ""

    # LinkedIn separates the two dates with an en dash, not a hyphen.
    times = string.split("–")
    from_date = times[0].strip()
    to_date = times[1].strip() if len(times) > 1 else ""
    return (from_date, to_date, duration)
|
|
14
|
+
|
|
15
|
+
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
|
|
2
|
+
class Institution(object):
    """Base record for a LinkedIn institution (a company or a school)."""

    # NOTE(review): the class attribute is ``institution_name`` (and the
    # subclass __repr__ methods read it), but __init__ below stores the
    # ``name`` argument as ``self.name`` — looks like a naming mismatch;
    # confirm before relying on either attribute being populated.
    institution_name = None
    website = None
    industry = None
    type = None
    headquarters = None
    company_size = None
    founded = None

    def __init__(self, name=None, website=None, industry=None, type=None, headquarters=None, company_size=None, founded=None):
        self.name = name
        self.website = website
        self.industry = industry
        self.type = type
        self.headquarters = headquarters
        self.company_size = company_size
        self.founded = founded
|
|
19
|
+
|
|
20
|
+
class Experience(Institution):
    """A single work-experience entry on a profile."""

    from_date = None
    to_date = None
    description = None
    position_title = None

    def __init__(self, from_date=None, to_date=None, description=None, position_title=None):
        # Institution.__init__ is deliberately not invoked, matching the
        # rest of this module; institution_name is assigned by the caller.
        self.from_date = from_date
        self.to_date = to_date
        self.description = description
        self.position_title = position_title

    def __repr__(self):
        return "{position_title} at {company} from {from_date} to {to_date}".format(
            position_title=self.position_title,
            company=self.institution_name,
            from_date=self.from_date,
            to_date=self.to_date,
        )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Education(Institution):
    """A single school attendance on a LinkedIn profile."""

    from_date = None
    to_date = None
    description = None
    degree = None

    def __init__(self, from_date=None, to_date=None, description=None, degree=None):
        # NOTE: deliberately does not invoke Institution.__init__;
        # callers assign institution_name directly after construction.
        self.from_date = from_date
        self.to_date = to_date
        self.description = description
        self.degree = degree

    def __repr__(self):
        return "{degree} at {company} from {from_date} to {to_date}".format(
            from_date=self.from_date,
            to_date=self.to_date,
            degree=self.degree,
            company=self.institution_name,
        )
|
|
50
|
+
|
|
51
|
+
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
from lxml import html
|
|
3
|
+
from selenium import webdriver
|
|
4
|
+
from .functions import time_divide
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
class Person(object):
    """Scrapes and stores a single LinkedIn member profile.

    Attributes
    ----------
    name : str or None
        Display name, read from the page element with id "name".
    experiences : list of Experience
    educations : list of Education
    linkedin_url : str or None
        Profile URL the driver navigates to.
    driver : selenium WebDriver
        Browser instance used for scraping.
    """

    name = None
    also_viewed_urls = []
    linkedin_url = None
    driver = None

    def __init__(self, linkedin_url=None, experiences=None, educations=None, driver=None, scrape=True):
        """Open the profile page and optionally scrape it immediately.

        Parameters
        ----------
        linkedin_url : str, optional
            Profile URL to load.
        experiences, educations : list, optional
            Pre-populated entries; a fresh list is created when omitted.
        driver : selenium WebDriver, optional
            Existing browser to reuse; a Chrome driver is created when None.
        scrape : bool
            When True (default), scrape() is called from the constructor.
        """
        self.linkedin_url = linkedin_url
        # Bug fix: the original signature used mutable default arguments
        # ([]), so every Person created without explicit lists shared one
        # list object and accumulated each other's entries.
        self.experiences = experiences if experiences is not None else []
        self.educations = educations if educations is not None else []

        if driver is None:
            try:
                if os.getenv("CHROMEDRIVER") is None:
                    # Fall back to the chromedriver bundled with the package.
                    driver_path = os.path.join(os.path.dirname(__file__), 'drivers/chromedriver')
                else:
                    driver_path = os.getenv("CHROMEDRIVER")

                driver = webdriver.Chrome(driver_path)
            except Exception:
                # Last resort: let Selenium locate chromedriver on PATH.
                driver = webdriver.Chrome()

        driver.get(linkedin_url)
        self.driver = driver

        if scrape:
            self.scrape()

    def add_experience(self, experience):
        """Append an Experience entry to this profile."""
        self.experiences.append(experience)

    def add_education(self, education):
        """Append an Education entry to this profile."""
        self.educations.append(education)

    def scrape(self, close_on_complete=True):
        """Populate name, experiences and educations from the open page.

        Parameters
        ----------
        close_on_complete : bool
            When True (default), close the browser window once done.
        """
        driver = self.driver
        # WebDriver.get returns None, so the original ``page = ...``
        # assignment was dead; just navigate.
        driver.get(self.linkedin_url)

        # get name
        self.name = driver.find_element_by_id("name").text

        # get experience
        exp = driver.find_element_by_id("experience")
        for position in exp.find_elements_by_class_name("position"):
            position_title = position.find_element_by_class_name("item-title").text
            company = position.find_element_by_class_name("item-subtitle").text

            try:
                times = position.find_element_by_class_name("date-range").text
                from_date, to_date, duration = time_divide(times)
            except Exception:
                # Date-range element missing or malformed; keep the entry
                # with unknown dates rather than aborting the scrape.
                from_date, to_date = (None, None)
            experience = Experience(position_title=position_title, from_date=from_date, to_date=to_date)
            experience.institution_name = company
            self.add_experience(experience)

        # get education
        edu = driver.find_element_by_id("education")
        for school in edu.find_elements_by_class_name("school"):
            university = school.find_element_by_class_name("item-title").text
            degree = school.find_element_by_class_name("original").text
            try:
                times = school.find_element_by_class_name("date-range").text
                from_date, to_date, duration = time_divide(times)
            except Exception:
                from_date, to_date = (None, None)
            education = Education(from_date=from_date, to_date=to_date, degree=degree)
            education.institution_name = university
            self.add_education(education)

        if close_on_complete:
            driver.close()

    def __repr__(self):
        return "{name}\n\nExperience\n{exp}\n\nEducation\n{edu}".format(name=self.name, exp=self.experiences, edu=self.educations)
|
|
86
|
+
|
|
87
|
+
|