profgen 0.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
profgen/__init__.py ADDED
@@ -0,0 +1,43 @@
1
+ from importlib.metadata import PackageNotFoundError, version
2
+
3
+ from profgen.models import (
4
+ NOT_STATED,
5
+ Candidate,
6
+ EducationEntry,
7
+ EmploymentEntry,
8
+ ProjectEntry,
9
+ is_not_stated,
10
+ )
11
+ from profgen.pipeline import (
12
+ PipelineResult,
13
+ annotate_candidate,
14
+ check_grounding,
15
+ collect_missing_information,
16
+ render_review_report,
17
+ run_pipeline,
18
+ )
19
+
20
# Resolve the version of the installed distribution named after this package.
# Change here if project is renamed and does not equal the package name
dist_name = __name__
try:
    __version__ = version(dist_name)
except PackageNotFoundError:  # pragma: no cover
    # No installed distribution metadata was found under that name.
    __version__ = "unknown"
finally:
    # Drop the importlib helpers from the package namespace in either case.
    del version, PackageNotFoundError
28
+
29
+ __all__ = [
30
+ "__version__",
31
+ "NOT_STATED",
32
+ "Candidate",
33
+ "EmploymentEntry",
34
+ "ProjectEntry",
35
+ "EducationEntry",
36
+ "is_not_stated",
37
+ "check_grounding",
38
+ "collect_missing_information",
39
+ "annotate_candidate",
40
+ "render_review_report",
41
+ "run_pipeline",
42
+ "PipelineResult",
43
+ ]
profgen/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Convenience wrapper for profgen to run directly from source tree."""
2
+
3
+ from profgen.cli import cli
4
+
5
+ if __name__ == "__main__":
6
+ cli()
profgen/_version.py ADDED
@@ -0,0 +1,24 @@
1
+ # file generated by vcs-versioning
2
+ # don't change, don't track in version control
3
+ from __future__ import annotations
4
+
5
+ __all__ = [
6
+ "__version__",
7
+ "__version_tuple__",
8
+ "version",
9
+ "version_tuple",
10
+ "__commit_id__",
11
+ "commit_id",
12
+ ]
13
+
14
+ version: str
15
+ __version__: str
16
+ __version_tuple__: tuple[int | str, ...]
17
+ version_tuple: tuple[int | str, ...]
18
+ commit_id: str | None
19
+ __commit_id__: str | None
20
+
21
+ __version__ = version = '0.0.1rc1'
22
+ __version_tuple__ = version_tuple = (0, 0, 1, 'rc1')
23
+
24
+ __commit_id__ = commit_id = None
profgen/cli.py ADDED
@@ -0,0 +1,143 @@
1
+ """Command-line interface for profgen / cv_formatter.
2
+
3
+ Exposes a Click command group with two subcommands:
4
+
5
+ * ``convert`` — convert a CV (PDF/DOCX/TXT) into a standardised Word profile.
6
+ * ``make-template`` — (re)generate the starter ``.docx`` template.
7
+
8
+ Both subcommands are fully wired: ``convert`` runs the end-to-end pipeline
9
+ (pass ``--offline`` for the deterministic, network-free path), and
10
+ ``make-template`` regenerates the starter ``.docx``. Both the ``profgen``
11
+ and ``cv-formatter`` console entry points resolve to :func:`cli`.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+ from pathlib import Path
18
+
19
+ import click
20
+
21
+ from profgen import __version__
22
+
23
+ __author__ = "Kevin Steptoe"
24
+ __license__ = "MIT"
25
+
26
+ _logger = logging.getLogger("profgen")
27
+
28
+ _LOG_FORMAT = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s"
29
+ _DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
30
+
31
+
32
def _configure_logging(verbosity: int) -> None:
    """Configure logging from the number of ``-v`` flags given.

    A count of 0 selects WARNING and a count of 1 selects INFO; any other
    value selects DEBUG. The level is applied through
    :func:`logging.basicConfig` together with the module's log and date
    formats.
    """
    if verbosity == 0:
        level = logging.WARNING
    elif verbosity == 1:
        level = logging.INFO
    else:
        level = logging.DEBUG
    logging.basicConfig(level=level, format=_LOG_FORMAT, datefmt=_DATE_FORMAT)
36
+
37
+
38
@click.group(context_settings=dict(help_option_names=["-h", "--help"]))
@click.version_option(__version__, "--version")
@click.option(
    "-v",
    "--verbose",
    count=True,
    help="Increase verbosity (-v INFO, -vv DEBUG).",
)
def cli(verbose: int) -> None:
    """profgen — convert candidate CVs into standardised Word profiles."""
    # The group callback's only job is to set the log level from the -v count.
    _configure_logging(verbose)
44
+
45
+
46
@cli.command()
@click.argument(
    "source",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
)
@click.option(
    "--template",
    type=click.Path(dir_okay=False, path_type=Path),
    help="Bring-your-own .docx style-donor template.",
)
@click.option(
    "--style-map",
    "style_map_path",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
    help="TOML file mapping logical roles (title, date_heading, body, bullet, legal) to your template's style names.",
)
@click.option(
    "--output",
    type=click.Path(dir_okay=False, path_type=Path),
    help=(
        "Destination .docx profile (a sibling *.review.md is written alongside). "
        "Defaults to <source-stem>_profile.docx in the current directory."
    ),
)
@click.option(
    "--offline",
    is_flag=True,
    help="Use the deterministic offline client (no API key, no network).",
)
def convert(
    source: Path,
    template: Path | None,
    style_map_path: Path | None,
    output: Path | None,
    offline: bool,
) -> None:
    """Convert a CV into a standardised Word profile plus a *.review.md report.

    The profile is written to ``--output`` (default ``<source-stem>_profile.docx``
    in the current directory) and a sibling ``*.review.md`` review report is
    written alongside it. Supply ``--template`` to render against your own
    style-donor ``.docx`` and ``--style-map`` to map the renderer's logical roles
    to that template's style names. With ``--offline`` the deterministic heuristic
    client is used and no network access occurs; without it the Claude client is
    used and needs ``ANTHROPIC_API_KEY``.
    """
    from profgen.llm import StructuringError
    from profgen.pipeline import run_pipeline
    from profgen.template import load_style_map

    # Remediation advice shared by both online-failure messages below.
    hint = "Set ANTHROPIC_API_KEY, or pass --offline to use the network-free client."

    # A relative default path, so the profile lands in the current directory.
    destination = output if output is not None else Path(f"{source.stem}_profile.docx")

    style_map = None
    if style_map_path is not None:
        style_map = load_style_map(style_map_path)

    try:
        result = run_pipeline(
            source,
            destination,
            offline=offline,
            template_path=template,
            style_map=style_map,
        )
    except StructuringError as err:
        # Structuring failures always become a ClickException with the hint.
        raise click.ClickException(f"structuring failed: {err}\n{hint}") from err
    except Exception as err:  # noqa: BLE001 — turn any online setup failure into advice
        # Offline runs propagate the original exception untouched; online runs
        # wrap it in the API-key hint instead of showing a traceback.
        # NOTE(review): this also wraps failures unrelated to the Claude client
        # (e.g. extraction or template errors), where the hint may mislead —
        # confirm whether those should be reported separately.
        if offline:
            raise
        raise click.ClickException(f"could not run the Claude client: {err}\n{hint}") from err

    click.echo(f"Wrote profile to {result.profile_path}")
    review_path = result.review_path
    click.echo(f"Wrote review report to {review_path}")
    if result.needs_verification:
        # The warning goes to stderr, separate from the two path lines above.
        click.echo(
            f"Warning: some values need manual verification before customer submission — see {review_path}.",
            err=True,
        )
130
+
131
+
132
@cli.command(name="make-template")
@click.argument("path", type=click.Path(dir_okay=False, path_type=Path))
def make_template(path: Path) -> None:
    """(Re)generate the starter .docx template carrying the default named styles."""
    # Imported under an alias because this command shares the helper's name.
    from profgen.template import make_template as build_template

    click.echo(f"Wrote starter template to {build_template(path)}")
140
+
141
+
142
+ if __name__ == "__main__":
143
+ cli()
@@ -0,0 +1,58 @@
1
+ """Extraction stage: dispatch a source file to the right backend.
2
+
3
+ ``extract(path)`` is the public entry point. It chooses a backend by file
4
+ extension and returns an :class:`ExtractedDocument` of verbatim text.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+
11
+ from .base import ExtractedDocument, Extractor, ExtractorError, UnsupportedFormatError
12
+ from .docx import DocxExtractor
13
+ from .pdf import PdfExtractor
14
+ from .txt import TxtExtractor
15
+
16
# Registry of extractor instances keyed by lower-case file suffix. Lookups
# lower-case the incoming suffix first, so dispatch is case-insensitive.
_EXTRACTORS: dict[str, Extractor] = {
    # Plain text (two accepted suffixes, each with its own instance).
    ".txt": TxtExtractor(),
    ".text": TxtExtractor(),
    # Word documents.
    ".docx": DocxExtractor(),
    # PDF, using PdfExtractor's default backend.
    ".pdf": PdfExtractor(),
}

#: File extensions the dispatcher can handle.
SUPPORTED_SUFFIXES: tuple[str, ...] = tuple(sorted(_EXTRACTORS.keys()))
26
+
27
+
28
def get_extractor(path: Path | str) -> Extractor:
    """Look up the extractor registered for the file suffix of ``path``.

    The suffix is lower-cased before the lookup.

    Raises:
        UnsupportedFormatError: if no extractor is registered for the suffix
            (including when ``path`` has no suffix at all).
    """
    suffix = Path(path).suffix.lower()
    extractor = _EXTRACTORS.get(suffix)
    if extractor is not None:
        return extractor

    shown = suffix or "(no extension)"
    supported = ", ".join(SUPPORTED_SUFFIXES)
    # ``from None`` keeps any in-flight exception out of the reported traceback.
    raise UnsupportedFormatError(f"no extractor for {shown}; supported: {supported}") from None
37
+
38
+
39
def extract(path: Path | str) -> ExtractedDocument:
    """Extract verbatim text from a CV file, choosing the backend by extension.

    Raises:
        FileNotFoundError: if ``path`` is not an existing regular file. This
            check runs before the extension is looked at.
    """
    source = Path(path)
    if source.is_file():
        return get_extractor(source).extract(source)
    raise FileNotFoundError(source)
45
+
46
+
47
+ __all__ = [
48
+ "ExtractedDocument",
49
+ "Extractor",
50
+ "ExtractorError",
51
+ "UnsupportedFormatError",
52
+ "TxtExtractor",
53
+ "DocxExtractor",
54
+ "PdfExtractor",
55
+ "SUPPORTED_SUFFIXES",
56
+ "get_extractor",
57
+ "extract",
58
+ ]
@@ -0,0 +1,47 @@
1
+ """Core types for the extraction stage.
2
+
3
+ Extractors turn a source CV file into verbatim text. They do **no**
4
+ interpretation — they only read out the text the format contains, preserving
5
+ headings, dates, company names, skills and so on as faithfully as the format
6
+ allows. All downstream interpretation happens later, against this text.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+ from typing import Protocol, runtime_checkable
14
+
15
+
16
class ExtractorError(Exception):
    """Root of the extraction-stage exception hierarchy."""


class UnsupportedFormatError(ExtractorError):
    """Signals that a file's extension has no registered extractor."""
22
+
23
+
24
@dataclass(frozen=True)
class ExtractedDocument:
    """Immutable record of the text read from one source CV, with provenance.

    ``text`` is kept faithful to the source. ``source_path`` and ``backend``
    record the file that was read and the backend that read it.
    """

    source_path: Path
    text: str
    backend: str

    @property
    def normalized_text(self) -> str:
        """Return ``text`` with each run of whitespace collapsed to one space.

        This is the form the grounding check should match against: PDF
        extraction in particular wraps lines and can split a multi-word
        entity (e.g. ``"Quartus Prime"``) across a newline.
        """
        words = self.text.split()
        return " ".join(words)
41
+
42
+
43
@runtime_checkable
class Extractor(Protocol):
    """Structural interface: any object with ``extract(path)`` qualifies.

    Being runtime-checkable, it can be used with ``isinstance``; that check
    only verifies that an ``extract`` attribute exists, not its signature.
    """

    def extract(self, path: Path) -> ExtractedDocument:
        """Read ``path`` and return its content as an :class:`ExtractedDocument`."""
        ...
@@ -0,0 +1,43 @@
1
+ """Word (.docx) extractor backed by python-docx.
2
+
3
+ Walks the document body in order so paragraphs and tables come out in the same
4
+ sequence they appear on the page — table rows are flattened to tab-separated
5
+ text. This keeps the extracted text faithful to the source layout.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Iterator
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ from .base import ExtractedDocument
15
+
16
+
17
def _iter_block_text(document: Any) -> Iterator[str]:
    """Yield one string per body paragraph and per table row, in document order.

    Paragraphs yield their text. Each table row yields its cell texts joined
    with tabs. Body children that are neither a paragraph nor a table are
    skipped.
    """
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    for element in document.element.body.iterchildren():
        if isinstance(element, CT_Tbl):
            # NOTE(review): python-docx may report a merged cell once per grid
            # position it spans, which would repeat its text here — confirm
            # whether that matters downstream.
            rows = Table(element, document).rows
            yield from ("\t".join(cell.text for cell in row.cells) for row in rows)
        elif isinstance(element, CT_P):
            yield Paragraph(element, document).text
31
+
32
+
33
class DocxExtractor:
    """Extractor for ``.docx`` CVs; table content is included in the text."""

    backend = "python-docx"

    def extract(self, path: Path) -> ExtractedDocument:
        """Open ``path`` with python-docx and return its blocks joined by newlines."""
        # Imported lazily so python-docx is only loaded when a .docx is read.
        import docx

        blocks = _iter_block_text(docx.Document(str(path)))
        return ExtractedDocument(
            source_path=path,
            text="\n".join(blocks),
            backend=self.backend,
        )
@@ -0,0 +1,46 @@
1
+ """PDF extractor.
2
+
3
+ Defaults to the pdfplumber backend. pymupdf (``fitz``) is supported as an
4
+ optional, swappable backend — install the ``pdf-fast`` extra — and is typically
5
+ faster, but pdfplumber keeps the dependency surface minimal for the common case.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+
12
+ from .base import ExtractedDocument, ExtractorError
13
+
14
+
15
def _extract_pdfplumber(path: Path) -> str:
    """Return the text of every page of ``path``, newline-joined, via pdfplumber."""
    import pdfplumber

    page_texts = []
    with pdfplumber.open(str(path)) as pdf:
        for page in pdf.pages:
            # A falsy extract_text() result contributes an empty string.
            page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)
20
+
21
+
22
def _extract_pymupdf(path: Path) -> str:
    """Return the text of every page of ``path``, newline-joined, via pymupdf."""
    import fitz  # provided by pymupdf

    with fitz.open(str(path)) as doc:
        page_texts = [page.get_text() for page in doc]
    return "\n".join(page_texts)
27
+
28
+
29
+ _BACKENDS = {"pdfplumber": _extract_pdfplumber, "pymupdf": _extract_pymupdf}
30
+
31
+
32
class PdfExtractor:
    """Extractor for ``.pdf`` CVs that delegates to a named backend function."""

    def __init__(self, backend: str = "pdfplumber") -> None:
        """Select the backend by name.

        Raises:
            ExtractorError: if ``backend`` is not a key of the backend table.
        """
        if backend in _BACKENDS:
            self._backend = backend
            return
        raise ExtractorError(f"unknown PDF backend {backend!r}; choose from {sorted(_BACKENDS)}")

    @property
    def backend(self) -> str:
        """Name of the backend chosen at construction (read-only)."""
        return self._backend

    def extract(self, path: Path) -> ExtractedDocument:
        """Run the selected backend on ``path`` and wrap the resulting text."""
        read_pdf = _BACKENDS[self._backend]
        return ExtractedDocument(
            source_path=path,
            text=read_pdf(path),
            backend=self._backend,
        )
@@ -0,0 +1,26 @@
1
+ """Plain-text extractor."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from .base import ExtractedDocument
8
+
9
+
10
class TxtExtractor:
    """Read a ``.txt`` CV verbatim.

    Decoding order:

    1. A leading UTF-32 or UTF-16 byte-order mark selects that encoding.
    2. Otherwise the bytes are decoded as UTF-8, dropping a UTF-8 BOM if one
       is present so it does not end up glued to the first word of the text.
    3. If that decoding fails, fall back to CP1252 (the common Windows
       encoding for CVs exported from Word) with undecodable bytes replaced,
       so that smart quotes and dashes survive rather than raising.
    """

    backend = "text"

    def extract(self, path: Path) -> ExtractedDocument:
        """Read ``path`` as bytes, decode it, and return the text.

        Errors raised by ``path.read_bytes()`` propagate unchanged. Decoding
        itself never raises because the CP1252 fallback replaces bad bytes.
        """
        import codecs

        raw = path.read_bytes()

        # Probe UTF-32 before UTF-16: the UTF-32-LE BOM (FF FE 00 00) begins
        # with the UTF-16-LE BOM (FF FE).
        if raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
            encoding = "utf-32"
        elif raw.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
            encoding = "utf-16"
        else:
            # Same as UTF-8 for BOM-less input; strips a leading BOM otherwise.
            encoding = "utf-8-sig"

        try:
            text = raw.decode(encoding)
        except UnicodeDecodeError:
            text = raw.decode("cp1252", errors="replace")
        return ExtractedDocument(source_path=path, text=text, backend=self.backend)
@@ -0,0 +1,26 @@
1
+ """Structuring stage (stage 3): typed Candidate from extracted text.
2
+
3
+ Exposes the :class:`StructuringClient` protocol and its two implementations —
4
+ the production :class:`ClaudeStructuringClient` and the offline, deterministic
5
+ :class:`HeuristicStructuringClient` — plus the prompt constants that define the
6
+ forced-tool contract.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from .claude_client import (
12
+ ClaudeStructuringClient,
13
+ HeuristicStructuringClient,
14
+ StructuringClient,
15
+ StructuringError,
16
+ )
17
+ from .prompts import SYSTEM_PROMPT, TOOL_NAME
18
+
19
+ __all__ = [
20
+ "StructuringClient",
21
+ "ClaudeStructuringClient",
22
+ "HeuristicStructuringClient",
23
+ "StructuringError",
24
+ "SYSTEM_PROMPT",
25
+ "TOOL_NAME",
26
+ ]