PyPI - profgen - Versions diffs - 0.0.1rc1__py3-none-any.whl - Mend

profgen 0.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

profgen/__init__.py +43 -0
profgen/__main__.py +6 -0
profgen/_version.py +24 -0
profgen/cli.py +143 -0
profgen/extractors/__init__.py +58 -0
profgen/extractors/base.py +47 -0
profgen/extractors/docx.py +43 -0
profgen/extractors/pdf.py +46 -0
profgen/extractors/txt.py +26 -0
profgen/llm/__init__.py +26 -0
profgen/llm/claude_client.py +361 -0
profgen/llm/prompts.py +72 -0
profgen/models/__init__.py +21 -0
profgen/models/candidate.py +100 -0
profgen/pipeline.py +273 -0
profgen/template/__init__.py +37 -0
profgen/template/word_renderer.py +355 -0
profgen-0.0.1rc1.dist-info/METADATA +209 -0
profgen-0.0.1rc1.dist-info/RECORD +24 -0
profgen-0.0.1rc1.dist-info/WHEEL +5 -0
profgen-0.0.1rc1.dist-info/entry_points.txt +3 -0
profgen-0.0.1rc1.dist-info/licenses/AUTHORS.md +3 -0
profgen-0.0.1rc1.dist-info/licenses/LICENSE.txt +21 -0
profgen-0.0.1rc1.dist-info/top_level.txt +1 -0

profgen/__init__.py ADDED Viewed

@@ -0,0 +1,43 @@
+from importlib.metadata import PackageNotFoundError, version
+from profgen.models import (
+    NOT_STATED,
+    Candidate,
+    EducationEntry,
+    EmploymentEntry,
+    ProjectEntry,
+    is_not_stated,
+)
+from profgen.pipeline import (
+    PipelineResult,
+    annotate_candidate,
+    check_grounding,
+    collect_missing_information,
+    render_review_report,
+    run_pipeline,
+)
+# Resolve the package version from the installed distribution's metadata.
+try:
+    # Change here if project is renamed and does not equal the package name
+    dist_name = __name__
+    __version__ = version(dist_name)
+except PackageNotFoundError:  # pragma: no cover
+    # No installed distribution with that name: fall back to a placeholder.
+    __version__ = "unknown"
+finally:
+    # Remove the two imported helpers from the package namespace either way.
+    del version, PackageNotFoundError
+__all__ = [
+    "__version__",
+    "NOT_STATED",
+    "Candidate",
+    "EmploymentEntry",
+    "ProjectEntry",
+    "EducationEntry",
+    "is_not_stated",
+    "check_grounding",
+    "collect_missing_information",
+    "annotate_candidate",
+    "render_review_report",
+    "run_pipeline",
+    "PipelineResult",
+]

profgen/__main__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Convenience wrapper for profgen to run directly from source tree."""
+from profgen.cli import cli
+if __name__ == "__main__":
+    cli()

profgen/_version.py ADDED Viewed

@@ -0,0 +1,24 @@
+# file generated by vcs-versioning
+# don't change, don't track in version control
+from __future__ import annotations
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+version: str
+__version__: str
+__version_tuple__: tuple[int | str, ...]
+version_tuple: tuple[int | str, ...]
+commit_id: str | None
+__commit_id__: str | None
+__version__ = version = '0.0.1rc1'
+__version_tuple__ = version_tuple = (0, 0, 1, 'rc1')
+__commit_id__ = commit_id = None

profgen/cli.py ADDED Viewed

@@ -0,0 +1,143 @@
+"""Command-line interface for profgen / cv_formatter.
+Exposes a Click command group with two subcommands:
+* ``convert``       — convert a CV (PDF/DOCX/TXT) into a standardised Word profile.
+* ``make-template`` — (re)generate the starter ``.docx`` template.
+Both subcommands are fully wired: ``convert`` runs the end-to-end pipeline
+(pass ``--offline`` for the deterministic, network-free path), and
+``make-template`` regenerates the starter ``.docx``. Both the ``profgen``
+and ``cv-formatter`` console entry points resolve to :func:`cli`.
+"""
+from __future__ import annotations
+import logging
+from pathlib import Path
+import click
+from profgen import __version__
+__author__ = "Kevin Steptoe"
+__license__ = "MIT"
+_logger = logging.getLogger("profgen")
+_LOG_FORMAT = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s"
+_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
+def _configure_logging(verbosity: int) -> None:
+    """Configure logging from a ``-v`` count (0=WARNING, 1=INFO, 2+=DEBUG)."""
+    level = {0: logging.WARNING, 1: logging.INFO}.get(verbosity, logging.DEBUG)
+    logging.basicConfig(level=level, format=_LOG_FORMAT, datefmt=_DATE_FORMAT)
+# Root command group. ``-h`` is accepted alongside ``--help``; the docstring
+# below doubles as the help text Click prints for the group.
+@click.group(context_settings={"help_option_names": ["-h", "--help"]})
+@click.version_option(__version__, "--version")
+@click.option("-v", "--verbose", count=True, help="Increase verbosity (-v INFO, -vv DEBUG).")
+def cli(verbose: int) -> None:
+    """profgen — convert candidate CVs into standardised Word profiles."""
+    # ``verbose`` is the number of ``-v`` flags given (``count=True``).
+    _configure_logging(verbose)
+@cli.command()
+@click.argument("source", type=click.Path(exists=True, dir_okay=False, path_type=Path))
+@click.option(
+    "--template",
+    type=click.Path(dir_okay=False, path_type=Path),
+    help="Bring-your-own .docx style-donor template.",
+)
+@click.option(
+    "--style-map",
+    "style_map_path",
+    type=click.Path(exists=True, dir_okay=False, path_type=Path),
+    help="TOML file mapping logical roles (title, date_heading, body, bullet, legal) to your template's style names.",
+)
+@click.option(
+    "--output",
+    type=click.Path(dir_okay=False, path_type=Path),
+    help=(
+        "Destination .docx profile (a sibling *.review.md is written alongside). "
+        "Defaults to <source-stem>_profile.docx in the current directory."
+    ),
+)
+@click.option(
+    "--offline",
+    is_flag=True,
+    help="Use the deterministic offline client (no API key, no network).",
+)
+def convert(
+    source: Path,
+    template: Path | None,
+    style_map_path: Path | None,
+    output: Path | None,
+    offline: bool,
+) -> None:
+    """Convert a CV into a standardised Word profile plus a *.review.md report.
+    The profile is written to ``--output`` (default ``<source-stem>_profile.docx``
+    in the current directory) and a sibling ``*.review.md`` review report is
+    written alongside it. Supply ``--template`` to render against your own
+    style-donor ``.docx`` and ``--style-map`` to map the renderer's logical roles
+    to that template's style names. With ``--offline`` the deterministic heuristic
+    client is used and no network access occurs; without it the Claude client is
+    used and needs ``ANTHROPIC_API_KEY``.
+    """
+    from profgen.llm import StructuringError
+    from profgen.pipeline import run_pipeline
+    from profgen.template import load_style_map
+    if output is None:
+        output = Path(f"{source.stem}_profile.docx")
+    style_map = load_style_map(style_map_path) if style_map_path is not None else None
+    try:
+        result = run_pipeline(
+            source,
+            output,
+            offline=offline,
+            template_path=template,
+            style_map=style_map,
+        )
+    except StructuringError as exc:
+        # The online path failed mid-call (transport / API / malformed response).
+        raise click.ClickException(
+            f"structuring failed: {exc}\nSet ANTHROPIC_API_KEY, or pass --offline to use the network-free client."
+        ) from exc
+    except Exception as exc:  # noqa: BLE001 — turn any online setup failure into advice
+        # The Anthropic SDK raises a bare exception (e.g. a TypeError about
+        # authentication) when no API key is configured. The deterministic
+        # ``--offline`` path never reaches here, so this only guards the online
+        # path: re-raise it as a friendly hint rather than a traceback.
+        if offline:
+            raise
+        raise click.ClickException(
+            f"could not run the Claude client: {exc}\n"
+            "Set ANTHROPIC_API_KEY, or pass --offline to use the network-free client."
+        ) from exc
+    click.echo(f"Wrote profile to {result.profile_path}")
+    click.echo(f"Wrote review report to {result.review_path}")
+    if result.needs_verification:
+        click.echo(
+            f"Warning: some values need manual verification before customer submission — see {result.review_path}.",
+            err=True,
+        )
+@cli.command(name="make-template")
+@click.argument("path", type=click.Path(dir_okay=False, path_type=Path))
+def make_template(path: Path) -> None:
+    """(Re)generate the starter .docx template carrying the default named styles."""
+    from profgen.template import make_template as _make_template
+    written = _make_template(path)
+    click.echo(f"Wrote starter template to {written}")
+if __name__ == "__main__":
+    cli()

profgen/extractors/__init__.py ADDED Viewed

@@ -0,0 +1,58 @@
+"""Extraction stage: dispatch a source file to the right backend.
+``extract(path)`` is the public entry point. It chooses a backend by file
+extension and returns an :class:`ExtractedDocument` of verbatim text.
+"""
+from __future__ import annotations
+from pathlib import Path
+from .base import ExtractedDocument, Extractor, ExtractorError, UnsupportedFormatError
+from .docx import DocxExtractor
+from .pdf import PdfExtractor
+from .txt import TxtExtractor
+# Extension -> extractor. Keys are lower-case; dispatch is case-insensitive.
+_EXTRACTORS: dict[str, Extractor] = {
+    ".txt": TxtExtractor(),
+    ".text": TxtExtractor(),
+    ".docx": DocxExtractor(),
+    ".pdf": PdfExtractor(),
+}
+#: File extensions the dispatcher can handle.
+SUPPORTED_SUFFIXES: tuple[str, ...] = tuple(sorted(_EXTRACTORS))
+def get_extractor(path: Path | str) -> Extractor:
+    """Return the extractor registered for ``path``'s extension."""
+    suffix = Path(path).suffix.lower()
+    try:
+        return _EXTRACTORS[suffix]
+    except KeyError:
+        raise UnsupportedFormatError(
+            f"no extractor for {suffix or '(no extension)'}; supported: {', '.join(SUPPORTED_SUFFIXES)}"
+        ) from None
+def extract(path: Path | str) -> ExtractedDocument:
+    """Extract verbatim text from a CV file, dispatching by extension."""
+    path = Path(path)
+    if not path.is_file():
+        raise FileNotFoundError(path)
+    return get_extractor(path).extract(path)
+__all__ = [
+    "ExtractedDocument",
+    "Extractor",
+    "ExtractorError",
+    "UnsupportedFormatError",
+    "TxtExtractor",
+    "DocxExtractor",
+    "PdfExtractor",
+    "SUPPORTED_SUFFIXES",
+    "get_extractor",
+    "extract",
+]

profgen/extractors/base.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""Core types for the extraction stage.
+Extractors turn a source CV file into verbatim text. They do **no**
+interpretation — they only read out the text the format contains, preserving
+headings, dates, company names, skills and so on as faithfully as the format
+allows. All downstream interpretation happens later, against this text.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Protocol, runtime_checkable
+class ExtractorError(Exception):
+    """Base class for extraction failures.
+    Catch this to handle every error the extraction stage defines, including
+    :class:`UnsupportedFormatError`.
+    """
+class UnsupportedFormatError(ExtractorError):
+    """Raised when no extractor is registered for a file's extension.
+    Being an :class:`ExtractorError`, it is also caught by handlers for the
+    base class.
+    """
+@dataclass(frozen=True)
+class ExtractedDocument:
+    """The verbatim text of a source CV plus a little provenance.
+    ``text`` is faithful to the source. ``normalized_text`` collapses runs of
+    whitespace to single spaces, which is what the grounding check should match
+    against — PDF extraction in particular wraps lines and can split a
+    multi-word entity (e.g. ``"Quartus Prime"``) across a newline.
+    """
+    source_path: Path
+    text: str
+    backend: str
+    @property
+    def normalized_text(self) -> str:
+        return " ".join(self.text.split())
+# ``@runtime_checkable`` lets ``isinstance(obj, Extractor)`` work; such checks
+# only test that an ``extract`` attribute exists, not its signature.
+@runtime_checkable
+class Extractor(Protocol):
+    """Anything that can turn a path into an :class:`ExtractedDocument`."""
+    # Structural interface: implementers need not inherit from this class.
+    def extract(self, path: Path) -> ExtractedDocument: ...

profgen/extractors/docx.py ADDED Viewed

@@ -0,0 +1,43 @@
+"""Word (.docx) extractor backed by python-docx.
+Walks the document body in order so paragraphs and tables come out in the same
+sequence they appear on the page — table rows are flattened to tab-separated
+text. This keeps the extracted text faithful to the source layout.
+"""
+from __future__ import annotations
+from collections.abc import Iterator
+from pathlib import Path
+from typing import Any
+from .base import ExtractedDocument
+def _iter_block_text(document: Any) -> Iterator[str]:
+    """Yield the text of each paragraph and table in document order."""
+    from docx.oxml.table import CT_Tbl
+    from docx.oxml.text.paragraph import CT_P
+    from docx.table import Table
+    from docx.text.paragraph import Paragraph
+    for child in document.element.body.iterchildren():
+        if isinstance(child, CT_P):
+            yield Paragraph(child, document).text
+        elif isinstance(child, CT_Tbl):
+            table = Table(child, document)
+            for row in table.rows:
+                yield "\t".join(cell.text for cell in row.cells)
+class DocxExtractor:
+    """Read a ``.docx`` CV verbatim, including table content."""
+    backend = "python-docx"
+    def extract(self, path: Path) -> ExtractedDocument:
+        import docx
+        document = docx.Document(str(path))
+        text = "\n".join(_iter_block_text(document))
+        return ExtractedDocument(source_path=path, text=text, backend=self.backend)

profgen/extractors/pdf.py ADDED Viewed

@@ -0,0 +1,46 @@
+"""PDF extractor.
+Defaults to the pdfplumber backend. pymupdf (``fitz``) is supported as an
+optional, swappable backend — install the ``pdf-fast`` extra — and is typically
+faster, but pdfplumber keeps the dependency surface minimal for the common case.
+"""
+from __future__ import annotations
+from pathlib import Path
+from .base import ExtractedDocument, ExtractorError
+def _extract_pdfplumber(path: Path) -> str:
+    import pdfplumber
+    with pdfplumber.open(str(path)) as pdf:
+        return "\n".join((page.extract_text() or "") for page in pdf.pages)
+def _extract_pymupdf(path: Path) -> str:
+    import fitz  # provided by pymupdf
+    with fitz.open(str(path)) as doc:
+        return "\n".join(page.get_text() for page in doc)
+_BACKENDS = {"pdfplumber": _extract_pdfplumber, "pymupdf": _extract_pymupdf}
+class PdfExtractor:
+    """Read a ``.pdf`` CV verbatim using the selected backend."""
+    def __init__(self, backend: str = "pdfplumber") -> None:
+        if backend not in _BACKENDS:
+            raise ExtractorError(f"unknown PDF backend {backend!r}; choose from {sorted(_BACKENDS)}")
+        self._backend = backend
+    @property
+    def backend(self) -> str:
+        return self._backend
+    def extract(self, path: Path) -> ExtractedDocument:
+        text = _BACKENDS[self._backend](path)
+        return ExtractedDocument(source_path=path, text=text, backend=self._backend)

profgen/extractors/txt.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""Plain-text extractor."""
+from __future__ import annotations
+from pathlib import Path
+from .base import ExtractedDocument
+class TxtExtractor:
+    """Read a ``.txt`` CV verbatim.
+    Decodes as UTF-8, falling back to CP1252 (the common Windows encoding for
+    CVs exported from Word) so that smart quotes and dashes survive rather than
+    raising.
+    """
+    backend = "text"
+    def extract(self, path: Path) -> ExtractedDocument:
+        raw = path.read_bytes()
+        try:
+            text = raw.decode("utf-8")
+        except UnicodeDecodeError:
+            text = raw.decode("cp1252", errors="replace")
+        return ExtractedDocument(source_path=path, text=text, backend=self.backend)

profgen/llm/__init__.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""Structuring stage (stage 3): typed Candidate from extracted text.
+Exposes the :class:`StructuringClient` protocol and its two implementations —
+the production :class:`ClaudeStructuringClient` and the offline, deterministic
+:class:`HeuristicStructuringClient` — plus the prompt constants that define the
+forced-tool contract.
+"""
+from __future__ import annotations
+from .claude_client import (
+    ClaudeStructuringClient,
+    HeuristicStructuringClient,
+    StructuringClient,
+    StructuringError,
+)
+from .prompts import SYSTEM_PROMPT, TOOL_NAME
+__all__ = [
+    "StructuringClient",
+    "ClaudeStructuringClient",
+    "HeuristicStructuringClient",
+    "StructuringError",
+    "SYSTEM_PROMPT",
+    "TOOL_NAME",
+]