python-job-scraper 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,215 @@
1
+ """Utility functions for parsing Glassdoor job data from HTML."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+ from typing import Any
8
+
9
+ from bs4 import BeautifulSoup
10
+
11
+ from jobscraper.glassdoor.constant import BASE_URL
12
+ from jobscraper.model import Compensation, CompensationInterval, Location
13
+ from jobscraper.util import extract_emails_from_text
14
+
15
+
16
def get_location_id(session: Any, headers: dict[str, str], location: str) -> tuple[str, int] | None:
    """Look up a Glassdoor location slug and ID via the suggest API.

    Args:
        session: Active HTTP session.
        headers: Request headers.
        location: City or region name (e.g. "Bangalore").

    Returns:
        ``(location_slug, location_id)`` tuple, or None if not found.
    """
    url = f"{BASE_URL}/findPopularLocationAjax.htm?term={location}"
    try:
        resp = session.get(url, headers=headers)
        results = resp.json() if hasattr(resp, "json") else json.loads(resp.text)
    except Exception:
        # Best-effort lookup: network or JSON failures mean "not found".
        return None

    if not results:
        return None

    first = results[0]
    raw_id = first.get("locationId") or first.get("realId")
    if raw_id is None:
        # BUGFIX: previously int(None) raised TypeError which was silently
        # swallowed by a blanket except; make the missing-ID case explicit.
        return None
    try:
        loc_id = int(raw_id)
    except (TypeError, ValueError):
        return None

    label = first.get("label", location)
    slug = re.sub(r"[^a-z0-9]+", "-", label.lower()).strip("-")
    return slug, loc_id
40
+
41
+
42
def build_search_url(
    keyword: str,
    location_slug: str,
    location_id: int,
    page: int = 1,
) -> str:
    """Build a Glassdoor job search URL.

    Uses Glassdoor's SEO URL format:
    ``/Job/{loc}-{kw}-jobs-SRCH_IL.0,{L}_IC{id}_KO{L+1},{L+1+K}[_IP{page}].htm``

    Args:
        keyword: Job search term (e.g. "software engineer").
        location_slug: Slugified location (e.g. "bengaluru-india").
        location_id: Numeric Glassdoor location ID.
        page: Page number (1-indexed).

    Returns:
        Full URL string.
    """
    kw_slug = re.sub(r"[^a-z0-9]+", "-", keyword.lower()).strip("-")
    loc_len = len(location_slug)
    kw_len = len(kw_slug)
    # Pages after the first carry an _IP{page} suffix before ".htm".
    suffix = "" if page <= 1 else f"_IP{page}"
    return (
        f"{BASE_URL}/Job/{location_slug}-{kw_slug}-jobs-SRCH_"
        f"IL.0,{loc_len}_IC{location_id}_KO{loc_len + 1},{loc_len + 1 + kw_len}"
        f"{suffix}.htm"
    )
72
+
73
+
74
def parse_html_jobs(html: str) -> list[dict[str, Any]]:
    """Extract job listing dicts from Glassdoor's RSC-streamed HTML page.

    Glassdoor embeds job data as JSON inside ``self.__next_f.push([1, "..."])``
    script tags. This function decodes those chunks and extracts the
    ``jobListings`` array.

    Args:
        html: Raw HTML from a Glassdoor search results page.

    Returns:
        List of raw jobview dicts, or empty list on failure.
    """
    scripts = re.findall(r"<script[^>]*>(.*?)</script>", html, re.DOTALL)

    chunks: list[str] = []
    for script in scripts:
        m = re.search(r'self\.__next_f\.push\(\[1,"(.*)"\]\)', script, re.DOTALL)
        if not m:
            continue
        try:
            # The capture is the body of a JSON string literal; re-wrap it in
            # quotes so json.loads un-escapes it.
            chunks.append(json.loads('"' + m.group(1) + '"'))
        except ValueError:
            # Malformed chunk: skip it; other chunks may still be usable.
            continue

    text = "".join(chunks)
    if not text:
        return []

    # Find the "jobListings":[{"jobview":...}] array start.
    marker = '"jobListings":[{"jobview"'
    start = text.find(marker)
    if start == -1:
        return []

    array_start = start + len('"jobListings":')
    end = _find_json_array_end(text, array_start)
    if end == -1:
        return []

    try:
        listings = json.loads(text[array_start : end + 1])
        return [item["jobview"] for item in listings if "jobview" in item]
    except (ValueError, TypeError, KeyError):
        return []


def _find_json_array_end(text: str, array_start: int) -> int:
    """Return the index of the ``]`` closing the JSON array at *array_start*.

    BUGFIX: the previous depth counter ignored string literals, so a ``[`` or
    ``]`` inside any job field (e.g. a title like "Dev [Remote]") corrupted the
    extracted slice. This scanner skips quoted strings, honoring backslash
    escapes. Returns -1 if the array is never closed.
    """
    depth = 0
    in_string = False
    escaped = False
    for i in range(array_start, len(text)):
        ch = text[i]
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "[":
            depth += 1
        elif ch == "]":
            depth -= 1
            if depth == 0:
                return i
    return -1
126
+
127
+
128
def parse_compensation(header: dict[str, Any]) -> Compensation | None:
    """Extract compensation from a Glassdoor job header dict.

    Reads ``payPeriodAdjustedPay`` (p10/p90 range) and ``payPeriod`` interval.

    Args:
        header: The ``header`` sub-dict from a Glassdoor jobview.

    Returns:
        Compensation model or None if no pay data is present.
    """
    adjusted = header.get("payPeriodAdjustedPay") or {}
    if isinstance(adjusted, dict):
        low = adjusted.get("p10")
        high = adjusted.get("p90")
    else:
        low = high = None
    if low is None and high is None:
        return None

    period_key = (header.get("payPeriod") or "").lower()
    intervals: dict[str, CompensationInterval] = {
        "annual": CompensationInterval.YEARLY,
        "yearly": CompensationInterval.YEARLY,
        "monthly": CompensationInterval.MONTHLY,
        "weekly": CompensationInterval.WEEKLY,
        "daily": CompensationInterval.DAILY,
        "hourly": CompensationInterval.HOURLY,
    }

    try:
        return Compensation(
            interval=intervals.get(period_key),
            min_amount=None if low is None else float(low),
            max_amount=None if high is None else float(high),
            currency=header.get("payCurrency") or "INR",
        )
    except (ValueError, TypeError):
        # Non-numeric pay values: treat as no compensation data.
        return None
165
+
166
+
167
def parse_location(raw: str) -> Location:
    """Parse a Glassdoor location string into a Location model.

    Handles ``"Bengaluru, Karnataka"``, ``"Remote in Mumbai, Maharashtra"``,
    or a bare city name.

    Args:
        raw: Raw location string from Glassdoor header.

    Returns:
        Populated Location model.
    """
    if not raw:
        return Location()

    # Drop a leading "Remote in " prefix, then any trailing 5-6 digit PIN code.
    stripped = re.sub(r"^(?:remote\s+in\s+)", "", raw.strip(), flags=re.IGNORECASE)
    stripped = re.sub(r"\s*\d{5,6}\s*$", "", stripped).strip()
    pieces = [piece.strip() for piece in stripped.split(",") if piece.strip()]

    if not pieces:
        return Location()
    if len(pieces) == 1:
        return Location(city=pieces[0])
    return Location(city=pieces[0], state=pieces[1])
191
+
192
+
193
def get_job_detail_url(listing_id: str) -> str:
    """Build the full URL for a Glassdoor job detail page.

    Args:
        listing_id: The Glassdoor listing ID.

    Returns:
        Full URL string.
    """
    return BASE_URL + "/job-listing/jl=" + listing_id
203
+
204
+
205
def extract_emails(html: str) -> list[str]:
    """Extract email addresses from Glassdoor job detail HTML.

    Args:
        html: Raw HTML string from a job detail page.

    Returns:
        List of unique email addresses found.
    """
    page_text = BeautifulSoup(html, "lxml").get_text()
    unique_emails = set(extract_emails_from_text(page_text))
    return list(unique_emails)
@@ -0,0 +1,331 @@
1
+ """Indeed scraper implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import random
6
+ import time
7
+ from datetime import date, datetime
8
+ from typing import Any
9
+
10
+ from jobscraper.exception import IndeedException
11
+ from jobscraper.indeed.constant import INDEED_HEADERS, JOB_TYPE_MAP, JOBS_SEARCH_URL
12
+ from jobscraper.indeed.util import (
13
+ extract_emails,
14
+ get_job_detail_url,
15
+ parse_compensation,
16
+ parse_location,
17
+ parse_mosaic_json,
18
+ )
19
+ from jobscraper.model import JobPost, JobResponse, Scraper, ScraperInput, Site
20
+ from jobscraper.util import create_logger, create_session, get_company_website, markdown_converter
21
+
22
+ logger = create_logger("indeed")
23
+
24
+ _PAGE_SIZE = 15 # Indeed typically returns up to 15 results per page
25
+
26
+
27
class IndeedScraper(Scraper):
    """Scraper for in.indeed.com job listings.

    Uses TLS fingerprinting (chrome_120) to bypass anti-bot measures and
    extracts job data from the embedded mosaic-data JSON blob.
    """

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Fetch job listings from Indeed and return as a JobResponse.

        Args:
            scraper_input: Validated scraper configuration including search
                term, location, pagination, and session options.

        Returns:
            JobResponse containing all collected JobPost objects.

        Raises:
            IndeedException: On unrecoverable HTTP or parsing errors.
        """
        session = create_session(
            proxies=scraper_input.proxies,
            ca_cert=scraper_input.ca_cert,
            is_tls=True,
        )

        # Override User-Agent if provided.
        headers = dict(INDEED_HEADERS)
        if scraper_input.user_agent:
            headers["User-Agent"] = scraper_input.user_agent

        # Warm up the session to acquire cookies (avoids 403 on search).
        base_url = f"https://{scraper_input.country_indeed.value}.indeed.com/"
        try:
            session.get(base_url, headers=headers)
            time.sleep(random.uniform(2.0, 4.0))
        except Exception:
            # Warm-up is best-effort; the search itself may still succeed.
            pass

        jobs: list[JobPost] = []
        start = scraper_input.offset

        while len(jobs) < scraper_input.results_wanted:
            params = self._build_params(scraper_input, start)
            search_url = JOBS_SEARCH_URL.format(
                country=scraper_input.country_indeed.value
            )

            try:
                response = session.get(search_url, headers=headers, params=params)
            except Exception as exc:
                raise IndeedException(
                    f"Failed to fetch Indeed search page: {exc}"
                ) from exc

            status = getattr(response, "status_code", None)
            if isinstance(status, int) and status >= 400:
                raise IndeedException(
                    f"Indeed returned HTTP {status} for search request. "
                    "Bot detection may be blocking requests."
                )

            html = (
                response.text
                if hasattr(response, "text")
                else response.content.decode()
            )
            job_dicts = parse_mosaic_json(html)
            if not job_dicts:
                # BUGFIX: this condition was previously checked twice in a row
                # (a warning block, then a duplicate info+break block); it is
                # now a single check that logs and stops pagination.
                logger.warning(
                    "No jobs parsed from Indeed response (start=%d). "
                    "Page may be a bot-check or the structure may have changed.",
                    start,
                )
                break

            for raw in job_dicts:
                if len(jobs) >= scraper_input.results_wanted:
                    break
                job = self._build_job_post(raw, scraper_input, session, headers)
                if job:
                    jobs.append(job)

            # Exit early if Indeed returned a partial page (last page).
            if len(job_dicts) < _PAGE_SIZE:
                break

            start += _PAGE_SIZE
            time.sleep(random.uniform(0.5, 2.5))

        return JobResponse(jobs=jobs)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _build_params(self, scraper_input: ScraperInput, start: int) -> dict[str, Any]:
        """Build the query parameters for an Indeed search URL.

        Args:
            scraper_input: Scraper configuration.
            start: Zero-based result offset for pagination.

        Returns:
            Query-parameter dict for the /jobs endpoint.
        """
        params: dict[str, Any] = {
            "q": scraper_input.search_term,
            "start": start,
        }
        if scraper_input.location:
            params["l"] = scraper_input.location
        if scraper_input.distance is not None:
            params["radius"] = scraper_input.distance
        if scraper_input.hours_old is not None:
            # Indeed's "fromage" filter is in days; floor but never below 1.
            params["fromage"] = scraper_input.hours_old // 24 or 1
        return params

    def _build_job_post(
        self,
        raw: dict[str, Any],
        scraper_input: ScraperInput,
        session: Any,
        headers: dict[str, str],
    ) -> JobPost | None:
        """Convert a raw Indeed job dict to a JobPost.

        Uses field-level fallbacks so partial data never crashes the scraper;
        each parsing concern lives in its own helper. Logs warnings for
        missing optional fields.
        """
        try:
            job_key = raw.get("jobkey") or raw.get("jobKey") or ""
            if not job_key:
                logger.warning("Job dict missing jobkey, skipping: %s", raw)
                return None

            job_url = get_job_detail_url(job_key, scraper_input.country_indeed)

            # Title is mandatory; without it the post is useless.
            try:
                title = raw["title"]
            except KeyError:
                logger.warning("Job %s missing title", job_key)
                return None

            # dict.get never raises, so no try/except needed here.
            company = raw.get("company") or raw.get("companyName")

            try:
                raw_loc = raw.get("formattedLocation") or raw.get("location") or ""
                location = parse_location(raw_loc) if raw_loc else None
            except Exception:
                location = None
                logger.warning("Job %s: could not parse location", job_key)

            date_posted = self._parse_date_posted(raw, job_key)
            job_type = self._parse_job_types(raw, job_key)

            try:
                compensation = parse_compensation(raw)
            except Exception:
                compensation = None
                logger.warning("Job %s: could not parse compensation", job_key)

            is_remote = bool(raw.get("remoteLocation") or raw.get("remote"))

            is_indeed_apply, third_party_url = self._parse_apply_info(raw)

            description: str | None = None
            emails: list[str] | None = None
            # Seed job_url_direct from mosaic data if available.
            job_url_direct: str | None = (
                third_party_url if not is_indeed_apply else None
            )

            if scraper_input.fetch_full_description:
                description, emails, direct = self._fetch_detail_page(
                    session, headers, job_url, job_key, scraper_input
                )
                if direct:
                    job_url_direct = direct

            company_url, emails = self._collect_company_info(
                raw, company, session, headers, job_key, emails
            )

            try:
                company_logo = (
                    raw.get("companyBrandingAttributes", {}).get("logoUrl") or None
                )
            except Exception:
                company_logo = None

            return JobPost(
                id=job_key,
                site=Site.INDEED,
                job_url=job_url,
                job_url_direct=job_url_direct,
                title=title,
                company=company,
                location=location,
                date_posted=date_posted,
                job_type=job_type,
                compensation=compensation,
                is_remote=is_remote,
                is_indeed_apply=is_indeed_apply,
                description=description,
                emails=emails,
                company_url=company_url,
                company_logo=company_logo,
            )

        except Exception as exc:
            logger.warning("Unexpected error building JobPost: %s", exc)
            return None

    @staticmethod
    def _parse_date_posted(raw: dict[str, Any], job_key: str) -> date | None:
        """Parse the epoch-millisecond posting timestamp, if present."""
        try:
            ts = raw.get("pubDate") or raw.get("datePosted")
            if ts:
                return datetime.fromtimestamp(int(ts) / 1000).date()
            return None
        except Exception:
            logger.warning("Job %s: could not parse date_posted", job_key)
            return None

    @staticmethod
    def _parse_job_types(raw: dict[str, Any], job_key: str) -> list[Any] | None:
        """Map Indeed's jobTypes strings onto JobType enums.

        BUGFIX: the previous implementation defaulted to FULL_TIME when Indeed
        supplied no type data, mislabeling untyped postings; it now returns
        None so absent data stays absent.
        """
        try:
            raw_types = raw.get("jobTypes") or []
            if not (isinstance(raw_types, list) and raw_types):
                return None
            return [
                JOB_TYPE_MAP[t.lower()]
                for t in raw_types
                if t.lower() in JOB_TYPE_MAP
            ] or None
        except Exception:
            logger.warning("Job %s: could not parse job_type", job_key)
            return None

    @staticmethod
    def _parse_apply_info(raw: dict[str, Any]) -> tuple[bool | None, str | None]:
        """Determine Indeed-Apply status and any external ATS apply URL.

        indeedApplyEnabled=True means apply happens on Indeed itself;
        thirdPartyApplyUrl is the external ATS link when present.
        """
        try:
            indeed_apply_flag = raw.get("indeedApplyEnabled")
            third_party_url = raw.get("thirdPartyApplyUrl") or None
            if indeed_apply_flag is not None:
                return bool(indeed_apply_flag), third_party_url
            if third_party_url:
                return False, third_party_url
            return None, third_party_url
        except Exception:
            return None, None

    def _fetch_detail_page(
        self,
        session: Any,
        headers: dict[str, str],
        job_url: str,
        job_key: str,
        scraper_input: ScraperInput,
    ) -> tuple[str | None, list[str] | None, str | None]:
        """Fetch the job detail page; return (description, emails, direct_url).

        Any failure is logged and reported as all-None fields — detail data is
        optional enrichment, never fatal.
        """
        description: str | None = None
        emails: list[str] | None = None
        direct_url: str | None = None
        try:
            detail_resp = session.get(job_url, headers=headers)
            detail_html = (
                detail_resp.text
                if hasattr(detail_resp, "text")
                else detail_resp.content.decode()
            )
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(detail_html, "lxml")
            desc_tag = soup.find("div", {"id": "jobDescriptionText"})
            if desc_tag:
                raw_html = str(desc_tag)
                if scraper_input.description_format == "markdown":
                    description = markdown_converter(raw_html)
                else:
                    description = raw_html
            emails = extract_emails(detail_html) or None
            # Try to get the direct apply URL.
            apply_tag = soup.find("a", {"id": "applyButton"}) or soup.find(
                "a", {"data-testid": "applyButton"}
            )
            if apply_tag and apply_tag.get("href"):
                direct_url = str(apply_tag["href"])
            time.sleep(random.uniform(0.5, 2.5))
        except Exception as exc:
            logger.warning(
                "Job %s: failed to fetch detail page: %s", job_key, exc
            )
        return description, emails, direct_url

    def _collect_company_info(
        self,
        raw: dict[str, Any],
        company: str | None,
        session: Any,
        headers: dict[str, str],
        job_key: str,
        emails: list[str] | None,
    ) -> tuple[str | None, list[str] | None]:
        """Resolve the company website URL and merge any emails found on it.

        Returns ``(company_url, emails)`` where *emails* is the union of the
        incoming list and addresses scraped from the company site.
        """
        try:
            raw_company_url = raw.get("companyOverviewLink") or ""
            if raw_company_url.startswith("https"):
                company_url = raw_company_url
            elif company:
                company_url = get_company_website(company)
            else:
                company_url = None

            if company_url:
                try:
                    co_resp = session.get(company_url, headers=headers)
                    co_html = (
                        co_resp.text
                        if hasattr(co_resp, "text")
                        else co_resp.content.decode()
                    )
                    co_emails = extract_emails(co_html) or []
                    if co_emails:
                        merged = set(emails or []) | set(co_emails)
                        emails = list(merged) or None
                except Exception as exc:
                    logger.warning(
                        "Job %s: failed to scrape emails from company site %s: %s",
                        job_key, company_url, exc,
                    )
        except Exception:
            company_url = None
        return company_url, emails
@@ -0,0 +1,38 @@
1
+ """Constants for the Indeed scraper: headers, URLs, and job type mappings."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from jobscraper.model import JobType
6
+
7
# Browser-like request headers sent on every Indeed request; the User-Agent
# mirrors Chrome 120 on Windows 10 to reduce the chance of bot detection.
INDEED_HEADERS: dict[str, str] = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,*/*;q=0.8"
    ),
    "Referer": "https://in.indeed.com/",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

# URL templates; "{country}" is filled with the Indeed country subdomain
# (e.g. "in" for in.indeed.com) via str.format at request time.
BASE_URL: str = "https://{country}.indeed.com"
JOBS_SEARCH_URL: str = BASE_URL + "/jobs"

# Maps the lowercase job-type strings Indeed emits (both compact and
# hyphenated spellings) onto the project's JobType enum.
JOB_TYPE_MAP: dict[str, JobType] = {
    "fulltime": JobType.FULL_TIME,
    "full-time": JobType.FULL_TIME,
    "parttime": JobType.PART_TIME,
    "part-time": JobType.PART_TIME,
    "contract": JobType.CONTRACT,
    "contractor": JobType.CONTRACT,
    "temporary": JobType.TEMPORARY,
    "temp": JobType.TEMPORARY,
    "internship": JobType.INTERNSHIP,
    "intern": JobType.INTERNSHIP,
}