python-job-scraper 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,215 @@
1
+ """Utility functions for parsing Glassdoor job data from HTML."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+ from typing import Any
8
+
9
+ from bs4 import BeautifulSoup
10
+
11
+ from jobscraper.glassdoor.constant import BASE_URL
12
+ from jobscraper.model import Compensation, CompensationInterval, Location
13
+ from jobscraper.util import extract_emails_from_text
14
+
15
+
16
def get_location_id(session: Any, headers: dict[str, str], location: str) -> tuple[str, int] | None:
    """Look up a Glassdoor location slug and ID via the suggest API.

    Args:
        session: Active HTTP session.
        headers: Request headers.
        location: City or region name (e.g. "Bangalore").

    Returns:
        ``(location_slug, location_id)`` tuple, or None if not found.
    """
    url = f"{BASE_URL}/findPopularLocationAjax.htm?term={location}"
    try:
        resp = session.get(url, headers=headers)
        results = resp.json() if hasattr(resp, "json") else json.loads(resp.text)
    except Exception:
        # Best-effort lookup: network or JSON failures mean "not found".
        return None

    if not results:
        return None

    first = results[0]
    raw_id = first.get("locationId") or first.get("realId")
    if raw_id is None:
        # BUGFIX: previously int(None) raised TypeError which was silently
        # swallowed by a blanket except; make the missing-ID case explicit.
        return None
    try:
        loc_id = int(raw_id)
    except (TypeError, ValueError):
        return None

    label = first.get("label", location)
    slug = re.sub(r"[^a-z0-9]+", "-", label.lower()).strip("-")
    return slug, loc_id
40
+
41
+
42
def build_search_url(
    keyword: str,
    location_slug: str,
    location_id: int,
    page: int = 1,
) -> str:
    """Build a Glassdoor job search URL.

    Uses Glassdoor's SEO URL format:
    ``/Job/{loc}-{kw}-jobs-SRCH_IL.0,{L}_IC{id}_KO{L+1},{L+1+K}[_IP{page}].htm``

    Args:
        keyword: Job search term (e.g. "software engineer").
        location_slug: Slugified location (e.g. "bengaluru-india").
        location_id: Numeric Glassdoor location ID.
        page: Page number (1-indexed).

    Returns:
        Full URL string.
    """
    kw_slug = re.sub(r"[^a-z0-9]+", "-", keyword.lower()).strip("-")
    loc_len = len(location_slug)
    kw_len = len(kw_slug)
    # Pages after the first carry an _IP{page} suffix before ".htm".
    suffix = "" if page <= 1 else f"_IP{page}"
    return (
        f"{BASE_URL}/Job/{location_slug}-{kw_slug}-jobs-SRCH_"
        f"IL.0,{loc_len}_IC{location_id}_KO{loc_len + 1},{loc_len + 1 + kw_len}"
        f"{suffix}.htm"
    )
72
+
73
+
74
def parse_html_jobs(html: str) -> list[dict[str, Any]]:
    """Extract job listing dicts from Glassdoor's RSC-streamed HTML page.

    Glassdoor embeds job data as JSON inside ``self.__next_f.push([1, "..."])``
    script tags. This function decodes those chunks and extracts the
    ``jobListings`` array.

    Args:
        html: Raw HTML from a Glassdoor search results page.

    Returns:
        List of raw jobview dicts, or empty list on failure.
    """
    scripts = re.findall(r"<script[^>]*>(.*?)</script>", html, re.DOTALL)

    chunks: list[str] = []
    for script in scripts:
        m = re.search(r'self\.__next_f\.push\(\[1,"(.*)"\]\)', script, re.DOTALL)
        if not m:
            continue
        try:
            # The capture is the body of a JSON string literal; re-wrap it in
            # quotes so json.loads un-escapes it.
            chunks.append(json.loads('"' + m.group(1) + '"'))
        except ValueError:
            # Malformed chunk: skip it; other chunks may still be usable.
            continue

    text = "".join(chunks)
    if not text:
        return []

    # Find the "jobListings":[{"jobview":...}] array start.
    marker = '"jobListings":[{"jobview"'
    start = text.find(marker)
    if start == -1:
        return []

    array_start = start + len('"jobListings":')
    end = _find_json_array_end(text, array_start)
    if end == -1:
        return []

    try:
        listings = json.loads(text[array_start : end + 1])
        return [item["jobview"] for item in listings if "jobview" in item]
    except (ValueError, TypeError, KeyError):
        return []


def _find_json_array_end(text: str, array_start: int) -> int:
    """Return the index of the ``]`` closing the JSON array at *array_start*.

    BUGFIX: the previous depth counter ignored string literals, so a ``[`` or
    ``]`` inside any job field (e.g. a title like "Dev [Remote]") corrupted the
    extracted slice. This scanner skips quoted strings, honoring backslash
    escapes. Returns -1 if the array is never closed.
    """
    depth = 0
    in_string = False
    escaped = False
    for i in range(array_start, len(text)):
        ch = text[i]
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "[":
            depth += 1
        elif ch == "]":
            depth -= 1
            if depth == 0:
                return i
    return -1
126
+
127
+
128
def parse_compensation(header: dict[str, Any]) -> Compensation | None:
    """Extract compensation from a Glassdoor job header dict.

    Reads ``payPeriodAdjustedPay`` (p10/p90 range) and ``payPeriod`` interval.

    Args:
        header: The ``header`` sub-dict from a Glassdoor jobview.

    Returns:
        Compensation model or None if no pay data is present.
    """
    adjusted = header.get("payPeriodAdjustedPay") or {}
    if isinstance(adjusted, dict):
        low = adjusted.get("p10")
        high = adjusted.get("p90")
    else:
        low = high = None
    if low is None and high is None:
        return None

    period_key = (header.get("payPeriod") or "").lower()
    intervals: dict[str, CompensationInterval] = {
        "annual": CompensationInterval.YEARLY,
        "yearly": CompensationInterval.YEARLY,
        "monthly": CompensationInterval.MONTHLY,
        "weekly": CompensationInterval.WEEKLY,
        "daily": CompensationInterval.DAILY,
        "hourly": CompensationInterval.HOURLY,
    }

    try:
        return Compensation(
            interval=intervals.get(period_key),
            min_amount=None if low is None else float(low),
            max_amount=None if high is None else float(high),
            currency=header.get("payCurrency") or "INR",
        )
    except (ValueError, TypeError):
        # Non-numeric pay values: treat as no compensation data.
        return None
165
+
166
+
167
def parse_location(raw: str) -> Location:
    """Parse a Glassdoor location string into a Location model.

    Handles ``"Bengaluru, Karnataka"``, ``"Remote in Mumbai, Maharashtra"``,
    or a bare city name.

    Args:
        raw: Raw location string from Glassdoor header.

    Returns:
        Populated Location model.
    """
    if not raw:
        return Location()

    # Drop a leading "Remote in " prefix, then any trailing 5-6 digit PIN code.
    stripped = re.sub(r"^(?:remote\s+in\s+)", "", raw.strip(), flags=re.IGNORECASE)
    stripped = re.sub(r"\s*\d{5,6}\s*$", "", stripped).strip()
    pieces = [piece.strip() for piece in stripped.split(",") if piece.strip()]

    if not pieces:
        return Location()
    if len(pieces) == 1:
        return Location(city=pieces[0])
    return Location(city=pieces[0], state=pieces[1])
191
+
192
+
193
def get_job_detail_url(listing_id: str) -> str:
    """Build the full URL for a Glassdoor job detail page.

    Args:
        listing_id: The Glassdoor listing ID.

    Returns:
        Full URL string.
    """
    return BASE_URL + "/job-listing/jl=" + listing_id
203
+
204
+
205
def extract_emails(html: str) -> list[str]:
    """Extract email addresses from Glassdoor job detail HTML.

    Args:
        html: Raw HTML string from a job detail page.

    Returns:
        List of unique email addresses found.
    """
    page_text = BeautifulSoup(html, "lxml").get_text()
    unique_emails = set(extract_emails_from_text(page_text))
    return list(unique_emails)
@@ -0,0 +1,331 @@
1
+ """Indeed scraper implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import random
6
+ import time
7
+ from datetime import date, datetime
8
+ from typing import Any
9
+
10
+ from jobscraper.exception import IndeedException
11
+ from jobscraper.indeed.constant import INDEED_HEADERS, JOB_TYPE_MAP, JOBS_SEARCH_URL
12
+ from jobscraper.indeed.util import (
13
+ extract_emails,
14
+ get_job_detail_url,
15
+ parse_compensation,
16
+ parse_location,
17
+ parse_mosaic_json,
18
+ )
19
+ from jobscraper.model import JobPost, JobResponse, Scraper, ScraperInput, Site
20
+ from jobscraper.util import create_logger, create_session, get_company_website, markdown_converter
21
+
22
+ logger = create_logger("indeed")
23
+
24
+ _PAGE_SIZE = 15 # Indeed typically returns up to 15 results per page
25
+
26
+
27
class IndeedScraper(Scraper):
    """Scraper for in.indeed.com job listings.

    Uses TLS fingerprinting (chrome_120) to bypass anti-bot measures and
    extracts job data from the embedded mosaic-data JSON blob.
    """

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Fetch job listings from Indeed and return as a JobResponse.

        Args:
            scraper_input: Validated scraper configuration including search
                term, location, pagination, and session options.

        Returns:
            JobResponse containing all collected JobPost objects.

        Raises:
            IndeedException: On unrecoverable HTTP or parsing errors.
        """
        session = create_session(
            proxies=scraper_input.proxies,
            ca_cert=scraper_input.ca_cert,
            is_tls=True,
        )

        # Override User-Agent if provided.
        headers = dict(INDEED_HEADERS)
        if scraper_input.user_agent:
            headers["User-Agent"] = scraper_input.user_agent

        # Warm up the session to acquire cookies (avoids 403 on search).
        base_url = f"https://{scraper_input.country_indeed.value}.indeed.com/"
        try:
            session.get(base_url, headers=headers)
            time.sleep(random.uniform(2.0, 4.0))
        except Exception:
            # Warm-up is best-effort; the search itself may still succeed.
            pass

        jobs: list[JobPost] = []
        start = scraper_input.offset

        while len(jobs) < scraper_input.results_wanted:
            params = self._build_params(scraper_input, start)
            search_url = JOBS_SEARCH_URL.format(
                country=scraper_input.country_indeed.value
            )

            try:
                response = session.get(search_url, headers=headers, params=params)
            except Exception as exc:
                raise IndeedException(
                    f"Failed to fetch Indeed search page: {exc}"
                ) from exc

            status = getattr(response, "status_code", None)
            if isinstance(status, int) and status >= 400:
                raise IndeedException(
                    f"Indeed returned HTTP {status} for search request. "
                    "Bot detection may be blocking requests."
                )

            html = (
                response.text
                if hasattr(response, "text")
                else response.content.decode()
            )
            job_dicts = parse_mosaic_json(html)
            if not job_dicts:
                # BUGFIX: this condition was previously checked twice in a row
                # (a warning block, then a duplicate info+break block); it is
                # now a single check that logs and stops pagination.
                logger.warning(
                    "No jobs parsed from Indeed response (start=%d). "
                    "Page may be a bot-check or the structure may have changed.",
                    start,
                )
                break

            for raw in job_dicts:
                if len(jobs) >= scraper_input.results_wanted:
                    break
                job = self._build_job_post(raw, scraper_input, session, headers)
                if job:
                    jobs.append(job)

            # Exit early if Indeed returned a partial page (last page).
            if len(job_dicts) < _PAGE_SIZE:
                break

            start += _PAGE_SIZE
            time.sleep(random.uniform(0.5, 2.5))

        return JobResponse(jobs=jobs)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _build_params(self, scraper_input: ScraperInput, start: int) -> dict[str, Any]:
        """Build the query parameters for an Indeed search URL.

        Args:
            scraper_input: Scraper configuration.
            start: Zero-based result offset for pagination.

        Returns:
            Query-parameter dict for the /jobs endpoint.
        """
        params: dict[str, Any] = {
            "q": scraper_input.search_term,
            "start": start,
        }
        if scraper_input.location:
            params["l"] = scraper_input.location
        if scraper_input.distance is not None:
            params["radius"] = scraper_input.distance
        if scraper_input.hours_old is not None:
            # Indeed's "fromage" filter is in days; floor but never below 1.
            params["fromage"] = scraper_input.hours_old // 24 or 1
        return params

    def _build_job_post(
        self,
        raw: dict[str, Any],
        scraper_input: ScraperInput,
        session: Any,
        headers: dict[str, str],
    ) -> JobPost | None:
        """Convert a raw Indeed job dict to a JobPost.

        Uses field-level fallbacks so partial data never crashes the scraper;
        each parsing concern lives in its own helper. Logs warnings for
        missing optional fields.
        """
        try:
            job_key = raw.get("jobkey") or raw.get("jobKey") or ""
            if not job_key:
                logger.warning("Job dict missing jobkey, skipping: %s", raw)
                return None

            job_url = get_job_detail_url(job_key, scraper_input.country_indeed)

            # Title is mandatory; without it the post is useless.
            try:
                title = raw["title"]
            except KeyError:
                logger.warning("Job %s missing title", job_key)
                return None

            # dict.get never raises, so no try/except needed here.
            company = raw.get("company") or raw.get("companyName")

            try:
                raw_loc = raw.get("formattedLocation") or raw.get("location") or ""
                location = parse_location(raw_loc) if raw_loc else None
            except Exception:
                location = None
                logger.warning("Job %s: could not parse location", job_key)

            date_posted = self._parse_date_posted(raw, job_key)
            job_type = self._parse_job_types(raw, job_key)

            try:
                compensation = parse_compensation(raw)
            except Exception:
                compensation = None
                logger.warning("Job %s: could not parse compensation", job_key)

            is_remote = bool(raw.get("remoteLocation") or raw.get("remote"))

            is_indeed_apply, third_party_url = self._parse_apply_info(raw)

            description: str | None = None
            emails: list[str] | None = None
            # Seed job_url_direct from mosaic data if available.
            job_url_direct: str | None = (
                third_party_url if not is_indeed_apply else None
            )

            if scraper_input.fetch_full_description:
                description, emails, direct = self._fetch_detail_page(
                    session, headers, job_url, job_key, scraper_input
                )
                if direct:
                    job_url_direct = direct

            company_url, emails = self._collect_company_info(
                raw, company, session, headers, job_key, emails
            )

            try:
                company_logo = (
                    raw.get("companyBrandingAttributes", {}).get("logoUrl") or None
                )
            except Exception:
                company_logo = None

            return JobPost(
                id=job_key,
                site=Site.INDEED,
                job_url=job_url,
                job_url_direct=job_url_direct,
                title=title,
                company=company,
                location=location,
                date_posted=date_posted,
                job_type=job_type,
                compensation=compensation,
                is_remote=is_remote,
                is_indeed_apply=is_indeed_apply,
                description=description,
                emails=emails,
                company_url=company_url,
                company_logo=company_logo,
            )

        except Exception as exc:
            logger.warning("Unexpected error building JobPost: %s", exc)
            return None

    @staticmethod
    def _parse_date_posted(raw: dict[str, Any], job_key: str) -> date | None:
        """Parse the epoch-millisecond posting timestamp, if present."""
        try:
            ts = raw.get("pubDate") or raw.get("datePosted")
            if ts:
                return datetime.fromtimestamp(int(ts) / 1000).date()
            return None
        except Exception:
            logger.warning("Job %s: could not parse date_posted", job_key)
            return None

    @staticmethod
    def _parse_job_types(raw: dict[str, Any], job_key: str) -> list[Any] | None:
        """Map Indeed's jobTypes strings onto JobType enums.

        BUGFIX: the previous implementation defaulted to FULL_TIME when Indeed
        supplied no type data, mislabeling untyped postings; it now returns
        None so absent data stays absent.
        """
        try:
            raw_types = raw.get("jobTypes") or []
            if not (isinstance(raw_types, list) and raw_types):
                return None
            return [
                JOB_TYPE_MAP[t.lower()]
                for t in raw_types
                if t.lower() in JOB_TYPE_MAP
            ] or None
        except Exception:
            logger.warning("Job %s: could not parse job_type", job_key)
            return None

    @staticmethod
    def _parse_apply_info(raw: dict[str, Any]) -> tuple[bool | None, str | None]:
        """Determine Indeed-Apply status and any external ATS apply URL.

        indeedApplyEnabled=True means apply happens on Indeed itself;
        thirdPartyApplyUrl is the external ATS link when present.
        """
        try:
            indeed_apply_flag = raw.get("indeedApplyEnabled")
            third_party_url = raw.get("thirdPartyApplyUrl") or None
            if indeed_apply_flag is not None:
                return bool(indeed_apply_flag), third_party_url
            if third_party_url:
                return False, third_party_url
            return None, third_party_url
        except Exception:
            return None, None

    def _fetch_detail_page(
        self,
        session: Any,
        headers: dict[str, str],
        job_url: str,
        job_key: str,
        scraper_input: ScraperInput,
    ) -> tuple[str | None, list[str] | None, str | None]:
        """Fetch the job detail page; return (description, emails, direct_url).

        Any failure is logged and reported as all-None fields — detail data is
        optional enrichment, never fatal.
        """
        description: str | None = None
        emails: list[str] | None = None
        direct_url: str | None = None
        try:
            detail_resp = session.get(job_url, headers=headers)
            detail_html = (
                detail_resp.text
                if hasattr(detail_resp, "text")
                else detail_resp.content.decode()
            )
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(detail_html, "lxml")
            desc_tag = soup.find("div", {"id": "jobDescriptionText"})
            if desc_tag:
                raw_html = str(desc_tag)
                if scraper_input.description_format == "markdown":
                    description = markdown_converter(raw_html)
                else:
                    description = raw_html
            emails = extract_emails(detail_html) or None
            # Try to get the direct apply URL.
            apply_tag = soup.find("a", {"id": "applyButton"}) or soup.find(
                "a", {"data-testid": "applyButton"}
            )
            if apply_tag and apply_tag.get("href"):
                direct_url = str(apply_tag["href"])
            time.sleep(random.uniform(0.5, 2.5))
        except Exception as exc:
            logger.warning(
                "Job %s: failed to fetch detail page: %s", job_key, exc
            )
        return description, emails, direct_url

    def _collect_company_info(
        self,
        raw: dict[str, Any],
        company: str | None,
        session: Any,
        headers: dict[str, str],
        job_key: str,
        emails: list[str] | None,
    ) -> tuple[str | None, list[str] | None]:
        """Resolve the company website URL and merge any emails found on it.

        Returns ``(company_url, emails)`` where *emails* is the union of the
        incoming list and addresses scraped from the company site.
        """
        try:
            raw_company_url = raw.get("companyOverviewLink") or ""
            if raw_company_url.startswith("https"):
                company_url = raw_company_url
            elif company:
                company_url = get_company_website(company)
            else:
                company_url = None

            if company_url:
                try:
                    co_resp = session.get(company_url, headers=headers)
                    co_html = (
                        co_resp.text
                        if hasattr(co_resp, "text")
                        else co_resp.content.decode()
                    )
                    co_emails = extract_emails(co_html) or []
                    if co_emails:
                        merged = set(emails or []) | set(co_emails)
                        emails = list(merged) or None
                except Exception as exc:
                    logger.warning(
                        "Job %s: failed to scrape emails from company site %s: %s",
                        job_key, company_url, exc,
                    )
        except Exception:
            company_url = None
        return company_url, emails
@@ -0,0 +1,38 @@
1
+ """Constants for the Indeed scraper: headers, URLs, and job type mappings."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from jobscraper.model import JobType
6
+
7
# Browser-like request headers sent on every Indeed request; the User-Agent
# mirrors Chrome 120 on Windows 10 to reduce the chance of bot detection.
INDEED_HEADERS: dict[str, str] = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,*/*;q=0.8"
    ),
    "Referer": "https://in.indeed.com/",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

# URL templates; "{country}" is filled with the Indeed country subdomain
# (e.g. "in" for in.indeed.com) via str.format at request time.
BASE_URL: str = "https://{country}.indeed.com"
JOBS_SEARCH_URL: str = BASE_URL + "/jobs"

# Maps the lowercase job-type strings Indeed emits (both compact and
# hyphenated spellings) onto the project's JobType enum.
JOB_TYPE_MAP: dict[str, JobType] = {
    "fulltime": JobType.FULL_TIME,
    "full-time": JobType.FULL_TIME,
    "parttime": JobType.PART_TIME,
    "part-time": JobType.PART_TIME,
    "contract": JobType.CONTRACT,
    "contractor": JobType.CONTRACT,
    "temporary": JobType.TEMPORARY,
    "temp": JobType.TEMPORARY,
    "internship": JobType.INTERNSHIP,
    "intern": JobType.INTERNSHIP,
}