python-job-scraper 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jobscraper/__init__.py +302 -0
- jobscraper/exception.py +32 -0
- jobscraper/glassdoor/__init__.py +309 -0
- jobscraper/glassdoor/constant.py +33 -0
- jobscraper/glassdoor/util.py +215 -0
- jobscraper/indeed/__init__.py +331 -0
- jobscraper/indeed/constant.py +38 -0
- jobscraper/indeed/util.py +157 -0
- jobscraper/linkedin/__init__.py +5 -0
- jobscraper/linkedin/_scraper.py +283 -0
- jobscraper/linkedin/constant.py +60 -0
- jobscraper/linkedin/util.py +331 -0
- jobscraper/model.py +144 -0
- jobscraper/util.py +500 -0
- python_job_scraper-0.3.0.dist-info/METADATA +221 -0
- python_job_scraper-0.3.0.dist-info/RECORD +19 -0
- python_job_scraper-0.3.0.dist-info/WHEEL +5 -0
- python_job_scraper-0.3.0.dist-info/licenses/LICENSE +21 -0
- python_job_scraper-0.3.0.dist-info/top_level.txt +1 -0
jobscraper/__init__.py
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
"""jobscraper — Multi-platform job scraping library.
|
|
2
|
+
|
|
3
|
+
Public API:
|
|
4
|
+
scrape_jobs() — scrape job listings from one or more platforms and
|
|
5
|
+
return results as a pandas DataFrame.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
from jobscraper.model import (
|
|
17
|
+
Country,
|
|
18
|
+
JobResponse,
|
|
19
|
+
JobType,
|
|
20
|
+
ScraperInput,
|
|
21
|
+
Site,
|
|
22
|
+
)
|
|
23
|
+
from jobscraper.util import (
|
|
24
|
+
convert_to_annual,
|
|
25
|
+
desired_order,
|
|
26
|
+
extract_salary,
|
|
27
|
+
get_enum_from_value,
|
|
28
|
+
map_str_to_site,
|
|
29
|
+
set_logger_level,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Scraper registry
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
from jobscraper.glassdoor import GlassdoorScraper # noqa: E402
|
|
37
|
+
from jobscraper.indeed import IndeedScraper # noqa: E402
|
|
38
|
+
from jobscraper.linkedin import LinkedInScraper # noqa: E402
|
|
39
|
+
|
|
40
|
+
# Registry mapping each supported Site enum member to its scraper class.
# scrape_jobs() looks a class up here and instantiates it per request;
# adding a new platform means adding one entry (plus the import above).
SCRAPER_MAPPING: dict[Site, type] = {
    Site.INDEED: IndeedScraper,
    Site.GLASSDOOR: GlassdoorScraper,
    Site.LINKEDIN: LinkedInScraper,
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
# Public entry point
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def scrape_jobs(
    site_name: str | list[str] | Site | list[Site] | None = None,
    search_term: str | None = None,
    location: str | None = None,
    distance: int | None = 50,
    is_remote: bool = False,
    job_type: str | None = None,
    results_wanted: int = 20,
    country_indeed: str = "india",
    proxies: list[str] | str | None = None,
    ca_cert: str | None = None,
    description_format: str = "markdown",
    offset: int | None = 0,
    hours_old: int | None = None,
    enforce_annual_salary: bool = False,
    verbose: int = 0,
    user_agent: str | None = None,
    cookies: dict[str, str] | None = None,
) -> pd.DataFrame:
    """Scrape job listings from one or more platforms and return a DataFrame.

    Args:
        site_name: Platform(s) to scrape. Accepts a string, Site enum, or
            a mixed list of both. Supports "indeed", "glassdoor", "linkedin".
        search_term: Job title or keyword to search for.
        location: City or region to filter jobs by.
        distance: Search radius in km (default 50). Passed as-is to Indeed's
            ``radius`` parameter.
        is_remote: If True, filter for remote jobs only (not yet used by
            Indeed in Phase 1; reserved for future scrapers).
        job_type: Filter by job type string (e.g. "fulltime", "internship").
        results_wanted: Maximum number of job results to return (default 20).
        country_indeed: Country for the Indeed scraper (default "india").
        proxies: Optional proxy URL string or list of proxy URL strings.
        ca_cert: Path to a CA certificate bundle for HTTPS verification.
        description_format: "markdown" (default) or "html" for job
            description text format.
        offset: Start from the Nth result (useful for pagination across
            calls). Defaults to 0.
        hours_old: Only return jobs posted within this many hours.
        enforce_annual_salary: If True, normalize all salary intervals to
            annual equivalents before returning.
        verbose: Logging verbosity. 0=ERROR, 1=WARNING, 2=INFO.
        user_agent: Override the default User-Agent header for Indeed.
        cookies: Optional cookies dict to pass to scrapers (e.g. for
            LinkedIn authentication).

    Returns:
        A pandas DataFrame with one row per job posting. Columns follow
        ``desired_order``; all-NA columns are dropped.

    Raises:
        ValueError: If site_name is missing or an unsupported site is
            requested.
    """
    # Local import keeps the module's import block untouched; previously
    # this was re-imported inside the executor loop on every failure.
    import logging

    set_logger_level(verbose)

    if not site_name:
        raise ValueError("site_name is required.")

    # Coerce site_name to list[Site]
    if isinstance(site_name, (str, Site)):
        site_name = [site_name]
    sites: list[Site] = [
        map_str_to_site(s) if isinstance(s, str) else s for s in site_name
    ]

    # Coerce country_indeed
    country = Country.from_string(country_indeed)

    # Coerce job_type
    job_type_enum: JobType | None = (
        get_enum_from_value(job_type) if job_type else None
    )

    # Normalize proxies to list
    if isinstance(proxies, str):
        proxies = [p.strip() for p in proxies.split(",") if p.strip()]

    scraper_input = ScraperInput(
        site_name=sites,
        search_term=search_term or "",
        location=location,
        distance=distance,
        hours_old=hours_old,
        results_wanted=results_wanted,
        offset=offset or 0,
        job_type=job_type_enum,
        is_remote=is_remote,
        cookies=cookies,
        country_indeed=country,
        description_format=description_format,  # type: ignore[arg-type]
        fetch_full_description=True,
        proxies=proxies,
        ca_cert=ca_cert,
        enforce_annual_salary=enforce_annual_salary,
        user_agent=user_agent,
    )

    # Dispatch scrapers via ThreadPoolExecutor; one worker per site.
    responses: list[JobResponse] = []

    def _run(site: Site) -> JobResponse:
        scraper_cls = SCRAPER_MAPPING.get(site)
        if scraper_cls is None:
            raise ValueError(f"No scraper implemented for site: {site}")
        return scraper_cls().scrape(scraper_input)

    with ThreadPoolExecutor(max_workers=len(sites)) as executor:
        futures = {executor.submit(_run, site): site for site in sites}
        for future in as_completed(futures):
            try:
                responses.append(future.result())
            except Exception as exc:
                # One failing site must not abort the others; log and go on.
                logging.getLogger("jobscraper:main").warning(
                    "Scraper for %s raised: %s", futures[future], exc
                )

    if not responses or not any(r.jobs for r in responses):
        return pd.DataFrame()

    # Flatten all JobPost objects to dicts
    rows: list[dict[str, Any]] = [
        _flatten_job(job, enforce_annual_salary)
        for response in responses
        for job in response.jobs
    ]

    if not rows:
        return pd.DataFrame()

    # BUG FIX: build one DataFrame directly from the list of dicts instead
    # of concatenating one single-row frame per job (n throwaway frames).
    df = pd.DataFrame(rows)

    # Drop all-NA columns
    df = df.dropna(axis=1, how="all")

    # Ensure all desired_order columns are present (add None for missing)
    for col in desired_order:
        if col not in df.columns:
            df[col] = None

    # Reorder columns: desired_order first, then any extras
    extra_cols = [c for c in df.columns if c not in desired_order]
    df = df[[c for c in desired_order if c in df.columns] + extra_cols]

    # Sort by site ascending, then date_posted descending.
    sort_cols = [c for c in ["site", "date_posted"] if c in df.columns]
    if sort_cols:
        # BUG FIX: the ascending list must match sort_cols in length; the
        # hard-coded [True, False] raised ValueError when only one of the
        # two columns survived the filter above.
        df = df.sort_values(
            sort_cols,
            ascending=[c != "date_posted" for c in sort_cols],
            na_position="last",
        )

    return df.reset_index(drop=True)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
# ---------------------------------------------------------------------------
|
|
211
|
+
# Private helpers
|
|
212
|
+
# ---------------------------------------------------------------------------
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _flatten_job(job: Any, enforce_annual_salary: bool) -> dict[str, Any]:
|
|
216
|
+
"""Flatten a JobPost into a dict suitable for a DataFrame row.
|
|
217
|
+
|
|
218
|
+
- compensation → min_amount, max_amount, interval, currency columns
|
|
219
|
+
- location → formatted string via display_location()
|
|
220
|
+
- job_type list → comma-joined string
|
|
221
|
+
- emails list → comma-joined string
|
|
222
|
+
- Falls back to extract_salary() on description when no compensation object
|
|
223
|
+
"""
|
|
224
|
+
data: dict[str, Any] = {
|
|
225
|
+
"id": job.id,
|
|
226
|
+
"site": job.site.value if job.site else None,
|
|
227
|
+
"job_url": job.job_url,
|
|
228
|
+
"job_url_direct": job.job_url_direct,
|
|
229
|
+
"title": job.title,
|
|
230
|
+
"company": job.company,
|
|
231
|
+
"location": job.location.display_location() if job.location else None,
|
|
232
|
+
"date_posted": job.date_posted,
|
|
233
|
+
"is_remote": job.is_remote,
|
|
234
|
+
"is_indeed_apply": job.is_indeed_apply,
|
|
235
|
+
"job_level": job.job_level,
|
|
236
|
+
"company_url": job.company_url,
|
|
237
|
+
"company_logo": job.company_logo,
|
|
238
|
+
"description": job.description,
|
|
239
|
+
"emails": ", ".join(job.emails) if job.emails else None,
|
|
240
|
+
"job_type": (
|
|
241
|
+
", ".join(jt.value for jt in job.job_type) if job.job_type else None
|
|
242
|
+
),
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
# Compensation columns
|
|
246
|
+
salary_source: str | None = None
|
|
247
|
+
if job.compensation:
|
|
248
|
+
data["min_amount"] = job.compensation.min_amount
|
|
249
|
+
data["max_amount"] = job.compensation.max_amount
|
|
250
|
+
data["interval"] = (
|
|
251
|
+
job.compensation.interval.value if job.compensation.interval else None
|
|
252
|
+
)
|
|
253
|
+
data["currency"] = job.compensation.currency
|
|
254
|
+
salary_source = "DIRECT_DATA"
|
|
255
|
+
elif job.description:
|
|
256
|
+
# Fallback: extract salary from description text only when salary
|
|
257
|
+
# keywords or a currency symbol are present, to avoid false positives
|
|
258
|
+
# like "0–2 years of experience" being parsed as a salary range.
|
|
259
|
+
_salary_ctx = re.compile(
|
|
260
|
+
r"\b(salary|pay|stipend|ctc|lpa|lakh|lac|package|compensation"
|
|
261
|
+
r"|(?:per|a|an)\s+(?:hour|month|year|annum))\b"
|
|
262
|
+
r"|[$£€₹]",
|
|
263
|
+
re.IGNORECASE,
|
|
264
|
+
)
|
|
265
|
+
if _salary_ctx.search(job.description):
|
|
266
|
+
try:
|
|
267
|
+
interval_str, min_val, max_val, currency = extract_salary(
|
|
268
|
+
job.description
|
|
269
|
+
)
|
|
270
|
+
data["min_amount"] = min_val
|
|
271
|
+
data["max_amount"] = max_val
|
|
272
|
+
data["interval"] = interval_str
|
|
273
|
+
data["currency"] = currency
|
|
274
|
+
if min_val is not None:
|
|
275
|
+
salary_source = "DESCRIPTION"
|
|
276
|
+
except Exception:
|
|
277
|
+
data["min_amount"] = None
|
|
278
|
+
data["max_amount"] = None
|
|
279
|
+
data["interval"] = None
|
|
280
|
+
data["currency"] = None
|
|
281
|
+
else:
|
|
282
|
+
data["min_amount"] = None
|
|
283
|
+
data["max_amount"] = None
|
|
284
|
+
data["interval"] = None
|
|
285
|
+
data["currency"] = None
|
|
286
|
+
else:
|
|
287
|
+
data["min_amount"] = None
|
|
288
|
+
data["max_amount"] = None
|
|
289
|
+
data["interval"] = None
|
|
290
|
+
data["currency"] = None
|
|
291
|
+
|
|
292
|
+
data["salary_source"] = salary_source
|
|
293
|
+
|
|
294
|
+
# Enforce annual salary normalization
|
|
295
|
+
if (
|
|
296
|
+
enforce_annual_salary
|
|
297
|
+
and data.get("interval")
|
|
298
|
+
and data.get("min_amount") is not None
|
|
299
|
+
):
|
|
300
|
+
convert_to_annual(data)
|
|
301
|
+
|
|
302
|
+
return data
|
jobscraper/exception.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Custom exceptions for jobscraper.
|
|
2
|
+
|
|
3
|
+
This module defines platform-specific exception classes raised by scrapers
|
|
4
|
+
when they encounter unrecoverable errors. Future platform exceptions are
|
|
5
|
+
stubbed out as comments and will be uncommented as scrapers are added.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class IndeedException(Exception):
|
|
12
|
+
"""Raised when the Indeed scraper encounters an unrecoverable error."""
|
|
13
|
+
|
|
14
|
+
def __init__(self, message: str | None = None):
|
|
15
|
+
super().__init__(message or "An error occurred with Indeed")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GlassdoorException(Exception):
|
|
19
|
+
"""Raised when the Glassdoor scraper encounters an unrecoverable error."""
|
|
20
|
+
|
|
21
|
+
def __init__(self, message: str | None = None):
|
|
22
|
+
super().__init__(message or "An error occurred with Glassdoor")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class LinkedInException(Exception):
|
|
26
|
+
"""Raised when the LinkedIn scraper encounters an unrecoverable error."""
|
|
27
|
+
|
|
28
|
+
def __init__(self, message: str | None = None):
|
|
29
|
+
super().__init__(message or "An error occurred with LinkedIn")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# class NaukriException(Exception): pass # planned
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
"""Glassdoor scraper implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import random
|
|
6
|
+
import time
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from jobscraper.exception import GlassdoorException
|
|
11
|
+
from jobscraper.glassdoor.constant import GLASSDOOR_HEADERS, JOB_TYPE_MAP
|
|
12
|
+
from jobscraper.glassdoor.util import (
|
|
13
|
+
build_search_url,
|
|
14
|
+
extract_emails,
|
|
15
|
+
get_job_detail_url,
|
|
16
|
+
get_location_id,
|
|
17
|
+
parse_compensation,
|
|
18
|
+
parse_html_jobs,
|
|
19
|
+
parse_location,
|
|
20
|
+
)
|
|
21
|
+
from jobscraper.model import JobPost, JobResponse, Scraper, ScraperInput, Site
|
|
22
|
+
from jobscraper.util import create_logger, create_session, markdown_converter
|
|
23
|
+
|
|
24
|
+
logger = create_logger("glassdoor")
|
|
25
|
+
|
|
26
|
+
_PAGE_SIZE = 27 # Glassdoor returns ~27 results per HTML page
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class GlassdoorScraper(Scraper):
    """Scraper for glassdoor.co.in job listings.

    Fetches search result pages as HTML and extracts job data from the
    embedded RSC (React Server Components) JSON chunks. No GraphQL needed.
    """

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Fetch job listings from Glassdoor and return as a JobResponse.

        Args:
            scraper_input: Validated scraper configuration.

        Returns:
            JobResponse containing all collected JobPost objects.

        Raises:
            GlassdoorException: On unrecoverable HTTP errors.
        """
        session = create_session(
            proxies=scraper_input.proxies,
            ca_cert=scraper_input.ca_cert,
            is_tls=True,
        )

        headers = dict(GLASSDOOR_HEADERS)
        if scraper_input.user_agent:
            headers["User-Agent"] = scraper_input.user_agent

        # Warm up session to acquire cookies; best-effort, failure ignored.
        try:
            # FIX: was an f-string with no placeholders.
            session.get("https://www.glassdoor.co.in", headers=headers)
            time.sleep(random.uniform(1.0, 2.0))
        except Exception:
            pass

        # Resolve location → slug + numeric ID (fallback: all of India).
        loc_slug, loc_id = "india", 115
        if scraper_input.location:
            result = get_location_id(session, headers, scraper_input.location)
            if result:
                loc_slug, loc_id = result
            else:
                logger.warning(
                    "Could not resolve location '%s'; using fallback.",
                    scraper_input.location,
                )

        jobs: list[JobPost] = []
        # offset is expressed in results; convert to a 1-based page number.
        page = (scraper_input.offset // _PAGE_SIZE) + 1

        while len(jobs) < scraper_input.results_wanted:
            url = build_search_url(
                keyword=scraper_input.search_term,
                location_slug=loc_slug,
                location_id=loc_id,
                page=page,
            )
            logger.info("Fetching Glassdoor page %d: %s", page, url)

            try:
                response = session.get(url, headers=headers)
            except Exception as exc:
                raise GlassdoorException(f"Failed to fetch Glassdoor page: {exc}") from exc

            status = getattr(response, "status_code", None)
            if isinstance(status, int) and status >= 400:
                raise GlassdoorException(
                    f"Glassdoor returned HTTP {status}. Bot detection may be active."
                )

            html = (
                response.text
                if hasattr(response, "text")
                else response.content.decode()
            )

            raw_jobs = parse_html_jobs(html)
            if not raw_jobs:
                logger.info("No jobs parsed on page %d; stopping.", page)
                break

            for raw in raw_jobs:
                if len(jobs) >= scraper_input.results_wanted:
                    break
                job = self._build_job_post(raw, scraper_input, session, headers)
                if job:
                    jobs.append(job)

            # A short page means we reached the last page of results.
            if len(raw_jobs) < _PAGE_SIZE:
                break

            page += 1
            time.sleep(random.uniform(0.5, 2.5))

        return JobResponse(jobs=jobs)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _build_job_post(
        self,
        raw: dict[str, Any],
        scraper_input: ScraperInput,
        session: Any,
        headers: dict[str, str],
    ) -> JobPost | None:
        """Convert a raw Glassdoor jobview dict to a JobPost.

        Uses field-level try/except so partial data never crashes the
        scraper — the sub-dicts may be missing or None in the raw JSON.
        Returns None when the entry cannot yield a usable JobPost.
        """
        try:
            header = raw.get("header", {})
            job = raw.get("job", {})
            overview = raw.get("overview", {})

            # ID — listingId lives in job sub-dict
            try:
                listing_id = str(job.get("listingId") or "")
                if not listing_id:
                    logger.warning("Glassdoor job missing listingId, skipping")
                    return None
            except Exception:
                return None

            job_url = get_job_detail_url(listing_id)

            # Title — in both header and job; prefer header
            try:
                title = header.get("jobTitleText") or job.get("jobTitleText")
                if not title:
                    logger.warning("Glassdoor job %s missing title", listing_id)
                    return None
            except Exception:
                # FIX: was `except KeyError`, which dict.get never raises; a
                # None header raises AttributeError, so the guard was dead.
                return None

            # Company
            try:
                company = (
                    header.get("employerNameFromSearch")
                    or header.get("employer", {}).get("name")
                )
            except Exception:
                company = None

            # Location
            try:
                location = parse_location(header.get("locationName") or "")
            except Exception:
                location = None

            # Date posted — Glassdoor gives ageInDays, not a timestamp
            try:
                age = header.get("ageInDays")
                if age is not None:
                    from datetime import date, timedelta
                    date_posted = date.today() - timedelta(days=int(age))
                else:
                    date_posted = None
            except Exception:
                date_posted = None

            # Job type
            try:
                raw_types = job.get("jobTypes") or []
                job_type = (
                    [
                        JOB_TYPE_MAP[t.lower()]
                        for t in raw_types
                        if t.lower() in JOB_TYPE_MAP
                    ] or None
                ) if raw_types else None
            except Exception:
                job_type = None

            # Compensation
            try:
                compensation = parse_compensation(header)
            except Exception:
                compensation = None

            # Remote
            try:
                is_remote = bool(job.get("isRemoteOrHybrid"))
            except Exception:
                is_remote = None

            # Easy apply = Glassdoor's own apply flow
            try:
                is_indeed_apply: bool | None = bool(header.get("easyApply"))
            except Exception:
                is_indeed_apply = None

            # Description (fragments available inline; full fetch optional)
            description: str | None = None
            emails: list[str] | None = None
            job_url_direct: str | None = None

            fragments = job.get("descriptionFragmentsText") or []
            if fragments:
                inline = " ".join(fragments)
                description = (
                    markdown_converter(f"<p>{inline}</p>")
                    if scraper_input.description_format == "markdown"
                    else inline
                )

            if scraper_input.fetch_full_description:
                try:
                    from bs4 import BeautifulSoup

                    detail_resp = session.get(job_url, headers=headers)
                    detail_html = (
                        detail_resp.text
                        if hasattr(detail_resp, "text")
                        else detail_resp.content.decode()
                    )
                    soup = BeautifulSoup(detail_html, "lxml")
                    desc_tag = (
                        soup.find("div", {"class": "jobDescriptionContent"})
                        or soup.find("div", {"id": "JobDescriptionContainer"})
                        or soup.find("div", {"data-test": "jobDescriptionText"})
                    )
                    if desc_tag:
                        raw_html = str(desc_tag)
                        description = (
                            markdown_converter(raw_html)
                            if scraper_input.description_format == "markdown"
                            else raw_html
                        )
                    emails = extract_emails(detail_html) or None
                    time.sleep(random.uniform(0.5, 2.5))
                except Exception as exc:
                    # Detail fetch is best-effort; keep the inline fragments.
                    logger.warning(
                        "Job %s: failed to fetch detail page: %s", listing_id, exc
                    )

            # Company URL from employer ID
            try:
                employer_id = header.get("employer", {}).get("id") or None
                company_url = (
                    f"{_GD_BASE}/Overview/W-EI_IE{employer_id}.htm"
                    if employer_id
                    else None
                )
            except Exception:
                company_url = None

            # Company logo
            try:
                company_logo = overview.get("squareLogoUrl") or None
            except Exception:
                company_logo = None

            return JobPost(
                id=listing_id,
                site=Site.GLASSDOOR,
                job_url=job_url,
                job_url_direct=job_url_direct,
                title=title,
                company=company,
                location=location,
                date_posted=date_posted,
                job_type=job_type,
                compensation=compensation,
                is_remote=is_remote,
                is_indeed_apply=is_indeed_apply,
                description=description,
                emails=emails,
                company_url=company_url,
                company_logo=company_logo,
            )

        except Exception as exc:
            logger.warning("Unexpected error building Glassdoor JobPost: %s", exc)
            return None
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
# Glassdoor (India) site root used to build employer overview URLs.
# Defined after the class body; resolved at call time inside _build_job_post.
_GD_BASE = "https://www.glassdoor.co.in"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Constants for the Glassdoor scraper."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from jobscraper.model import JobType
|
|
6
|
+
|
|
7
|
+
# Site root for the India-localized Glassdoor domain.
BASE_URL = "https://www.glassdoor.co.in"
# GraphQL endpoint (kept for potential future use alongside the HTML flow).
GRAPHQL_URL = BASE_URL + "/graph"

# Default request headers sent with every Glassdoor request; the scraper
# copies this dict and may override "User-Agent" from ScraperInput.
GLASSDOOR_HEADERS: dict[str, str] = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.9",
    "Content-Type": "application/json",
    "Apollo-Requires-Preflight": "true",
    "Referer": BASE_URL + "/",
    "Origin": BASE_URL,
}

# Maps lowercased Glassdoor job-type strings to JobType enum members.
# Both "fulltime"/"full_time" spellings are accepted; unknown strings are
# skipped by the scraper rather than raising.
JOB_TYPE_MAP: dict[str, JobType] = {
    "fulltime": JobType.FULL_TIME,
    "full_time": JobType.FULL_TIME,
    "parttime": JobType.PART_TIME,
    "part_time": JobType.PART_TIME,
    "contract": JobType.CONTRACT,
    "temporary": JobType.TEMPORARY,
    "internship": JobType.INTERNSHIP,
    "intern": JobType.INTERNSHIP,
}
|