yaams 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yaams/__init__.py +6 -0
- yaams/_default_config.yaml +129 -0
- yaams/cli/__init__.py +22 -0
- yaams/cli/_envelope.py +167 -0
- yaams/cli/_root.py +57 -0
- yaams/cli/_shared.py +107 -0
- yaams/cli/assoc.py +224 -0
- yaams/cli/consolidate.py +190 -0
- yaams/cli/doctor.py +182 -0
- yaams/cli/enrich.py +85 -0
- yaams/cli/entities.py +1402 -0
- yaams/cli/ingest.py +923 -0
- yaams/cli/main.py +354 -0
- yaams/cli/promote.py +232 -0
- yaams/cli/query.py +594 -0
- yaams/cli/review.py +180 -0
- yaams/cli/signals.py +143 -0
- yaams/cli/sources.py +1173 -0
- yaams/config.py +158 -0
- yaams/consolidate/__init__.py +19 -0
- yaams/consolidate/session.py +201 -0
- yaams/conventions.py +241 -0
- yaams/db.py +61 -0
- yaams/enrich/__init__.py +5 -0
- yaams/enrich/embed.py +78 -0
- yaams/enrich/entities.py +156 -0
- yaams/ingest/__init__.py +4 -0
- yaams/ingest/base.py +34 -0
- yaams/ingest/calendar.py +85 -0
- yaams/ingest/email_mbox.py +491 -0
- yaams/ingest/folder.py +300 -0
- yaams/ingest/github.py +360 -0
- yaams/ingest/imessage.py +280 -0
- yaams/ingest/ledger_notes.py +66 -0
- yaams/ingest/m365_mail.py +257 -0
- yaams/ingest/obsidian.py +143 -0
- yaams/ingest/signal.py +552 -0
- yaams/ingest/teams.py +353 -0
- yaams/ingest/teams_chatsvc.py +399 -0
- yaams/logsetup.py +77 -0
- yaams/people_import.py +256 -0
- yaams/promote/__init__.py +4 -0
- yaams/promote/candidates.py +355 -0
- yaams/promote/review.py +98 -0
- yaams/render.py +130 -0
- yaams/retrieve/__init__.py +19 -0
- yaams/retrieve/associate.py +232 -0
- yaams/retrieve/hybrid.py +559 -0
- yaams/retrieve/metadata.py +64 -0
- yaams/retrieve/parse.py +377 -0
- yaams/retrieve/route.py +137 -0
- yaams/retrieve/synonyms.py +93 -0
- yaams/schema.py +377 -0
- yaams/signals/__init__.py +37 -0
- yaams/signals/logger.py +160 -0
- yaams/signals/review.py +749 -0
- yaams/store.py +803 -0
- yaams/synthesize/__init__.py +29 -0
- yaams/synthesize/answer.py +232 -0
- yaams/synthesize/llm.py +266 -0
- yaams/time.py +31 -0
- yaams/watermark.py +36 -0
- yaams-0.3.0.dist-info/METADATA +308 -0
- yaams-0.3.0.dist-info/RECORD +68 -0
- yaams-0.3.0.dist-info/WHEEL +5 -0
- yaams-0.3.0.dist-info/entry_points.txt +2 -0
- yaams-0.3.0.dist-info/licenses/LICENSE +21 -0
- yaams-0.3.0.dist-info/top_level.txt +1 -0
yaams/__init__.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# YAAMS configuration. Copy to config.yaml and fill in your own values.
|
|
2
|
+
# See README.md and .plans/ for what each block does.
|
|
3
|
+
|
|
4
|
+
db_path: ~/yaams/data.db
|
|
5
|
+
|
|
6
|
+
ingest:
|
|
7
|
+
since: '2025-01-01T00:00:00Z'
|
|
8
|
+
|
|
9
|
+
imessage:
|
|
10
|
+
enabled: true
|
|
11
|
+
chat_db_path: ~/Library/Messages/chat.db
|
|
12
|
+
|
|
13
|
+
email:
|
|
14
|
+
enabled: true
|
|
15
|
+
sources:
|
|
16
|
+
# Apple Mail .emlx tree (point at the newest ~/Library/Mail/Vxx)
|
|
17
|
+
- type: emlx
|
|
18
|
+
path: ~/Library/Mail/V10
|
|
19
|
+
# Or one or more .mbox exports:
|
|
20
|
+
# - type: mbox
|
|
21
|
+
# path: ~/Downloads/all-mail.mbox
|
|
22
|
+
skip_newsletters: true
|
|
23
|
+
# Emails From these addresses are always kept regardless of List-* headers
|
|
24
|
+
# or noreply patterns. Add every address you send from across roles.
|
|
25
|
+
user_addresses:
|
|
26
|
+
- you@example.com
|
|
27
|
+
|
|
28
|
+
# Personal Obsidian vault. One vault per source; honors frontmatter
|
|
29
|
+
# `date:` / `created:` or YYYY-MM-DD filename prefixes for timestamps,
|
|
30
|
+
# strips embeds/wikilinks/frontmatter before indexing. Set `vault_path`
|
|
31
|
+
# via the `yaams sources` TUI or by editing it here.
|
|
32
|
+
notes:
|
|
33
|
+
enabled: false
|
|
34
|
+
vault_path: ~/Documents/Obsidian
|
|
35
|
+
# Optional. Directory names to skip anywhere in the vault tree.
|
|
36
|
+
# Defaults to .obsidian, .git, .smartchats, .smart-env, .claude.
|
|
37
|
+
# skip_dirs: [.obsidian, .git, archive]
|
|
38
|
+
|
|
39
|
+
# Generic recursive folder ingestion. Reads .txt and .md natively;
|
|
40
|
+
# .pdf needs `pip install pypdf`, .docx needs `pip install python-docx`.
|
|
41
|
+
# Files of unsupported types (or types with the dep missing) are skipped.
|
|
42
|
+
# Paths can be added/removed from the `yaams sources` TUI.
|
|
43
|
+
folders:
|
|
44
|
+
enabled: false
|
|
45
|
+
paths: []
|
|
46
|
+
# - ~/Documents/notes
|
|
47
|
+
# - ~/work/specs
|
|
48
|
+
# Optional. Defaults to .txt, .md, .markdown, .pdf, .docx
|
|
49
|
+
# extensions: [.txt, .md, .pdf]
|
|
50
|
+
# Optional. Directory names to skip anywhere in the tree.
|
|
51
|
+
# skip_dirs: [.git, .obsidian, node_modules]
|
|
52
|
+
|
|
53
|
+
# ---- Microsoft 365 / Graph sources ----
|
|
54
|
+
# All M365 sources authenticate via owa-piggy. Configure profiles once with
|
|
55
|
+
# `owa-piggy setup --profile <name>` and reference them by alias below.
|
|
56
|
+
# `calendar` and `mail` are profile-keyed and produce one yaams source per
|
|
57
|
+
# profile (e.g. mail_work, calendar_work). `teams` likewise.
|
|
58
|
+
|
|
59
|
+
teams:
|
|
60
|
+
enabled: false
|
|
61
|
+
profiles:
|
|
62
|
+
- work
|
|
63
|
+
skip_bots: true
|
|
64
|
+
page_size: 50
|
|
65
|
+
|
|
66
|
+
# owa-mail-backed inbox + sent ingestion. One source per profile.
|
|
67
|
+
# `folders` defaults to [Inbox, SentItems] — add Archive/etc. if wanted.
|
|
68
|
+
mail:
|
|
69
|
+
enabled: false
|
|
70
|
+
profiles:
|
|
71
|
+
- work
|
|
72
|
+
folders:
|
|
73
|
+
- Inbox
|
|
74
|
+
- SentItems
|
|
75
|
+
skip_newsletters: true
|
|
76
|
+
# Date range is sliced into chunks to stay under owa-mail's 200/req cap.
|
|
77
|
+
chunk_days: 30
|
|
78
|
+
# Addresses you send from. Messages From these are never filtered as
|
|
79
|
+
# newsletters/automated even if subject patterns would otherwise match.
|
|
80
|
+
user_addresses: []
|
|
81
|
+
# - you@example.com
|
|
82
|
+
|
|
83
|
+
# calendar:
|
|
84
|
+
# enabled: false
|
|
85
|
+
# profiles:
|
|
86
|
+
# - work
|
|
87
|
+
# skip_free: true
|
|
88
|
+
|
|
89
|
+
embed:
|
|
90
|
+
model: BAAI/bge-m3
|
|
91
|
+
batch_size: 32
|
|
92
|
+
# Apple Silicon: mps. Otherwise: cpu. CUDA also works if torch sees it.
|
|
93
|
+
device: mps
|
|
94
|
+
dimension: 1024
|
|
95
|
+
# Use only the locally cached HF snapshot; no network calls per run.
|
|
96
|
+
# Set to false the first time you change `model` so the new weights download.
|
|
97
|
+
offline: true
|
|
98
|
+
# Where HF stores model weights (sets HF_HOME). Defaults to
|
|
99
|
+
# ~/.local/share/huggingface so multi-GB weights survive `~/.cache` wipes.
|
|
100
|
+
# Respects an externally set $HF_HOME if you'd rather configure it that way.
|
|
101
|
+
# models_dir: ~/.local/share/huggingface
|
|
102
|
+
|
|
103
|
+
# LLM backend for synthesis (and future query parsing).
|
|
104
|
+
# Used by `yaams query --answer ...`. Off by default (dummy backend); pick
|
|
105
|
+
# one of:
|
|
106
|
+
# backend: ollama # local Ollama server
|
|
107
|
+
# model: llama3.1
|
|
108
|
+
# host: http://localhost:11434
|
|
109
|
+
# backend: subprocess # any CLI that takes prompt on stdin and prints answer
|
|
110
|
+
# command: ["codex", "exec", "--prompt-stdin"]
|
|
111
|
+
# # or: ["claude", "-p"]
|
|
112
|
+
# backend: dummy # no synthesis (also default if section omitted)
|
|
113
|
+
synth:
|
|
114
|
+
backend: dummy
|
|
115
|
+
|
|
116
|
+
entities:
|
|
117
|
+
spacy_model: xx_ent_wiki_sm
|
|
118
|
+
dictionary:
|
|
119
|
+
# Known people, places, projects, and aliases for entity tagging.
|
|
120
|
+
# Canonical name under `canonical`, casual variants under `aliases`.
|
|
121
|
+
- canonical: Example Person
|
|
122
|
+
type: person
|
|
123
|
+
aliases:
|
|
124
|
+
- Ex
|
|
125
|
+
- E. Person
|
|
126
|
+
- canonical: Example Org
|
|
127
|
+
type: org
|
|
128
|
+
aliases:
|
|
129
|
+
- EX
|
yaams/cli/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from yaams.cli import consolidate as _consolidate_mod # noqa: F401
|
|
4
|
+
from yaams.cli import assoc, enrich, entities, main, promote, query, review, signals, sources # noqa: F401
|
|
5
|
+
from yaams.cli import ingest as _ingest_mod # noqa: F401
|
|
6
|
+
from yaams.cli._root import cli
|
|
7
|
+
from yaams.cli._shared import _format_duration
|
|
8
|
+
from yaams.cli.consolidate import consolidate
|
|
9
|
+
from yaams.cli.ingest import _record_ingest_run, ingest
|
|
10
|
+
from yaams.cli.main import init_db, reset_db
|
|
11
|
+
from yaams.cli.query import query_cmd
|
|
12
|
+
|
|
13
|
+
# Names re-exported from ``yaams.cli``; every entry is imported earlier in
# this module. Two underscore-prefixed helpers are listed on purpose -
# presumably so tests or other callers can import them from the package
# root; confirm before removing them.
__all__ = [
    "cli",
    "init_db",
    "reset_db",
    "ingest",
    "query_cmd",
    "consolidate",
    "_format_duration",
    "_record_ingest_run",
]
|
yaams/cli/_envelope.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""JSON failure envelope guard for YAAMS data-class commands.
|
|
2
|
+
|
|
3
|
+
Background (Plan 06)
|
|
4
|
+
--------------------
|
|
5
|
+
Action-class commands (init-db, setup, ingest, reset-db) already wrap
|
|
6
|
+
their ``load_config`` call in a try/except and emit an action envelope
|
|
7
|
+
on failure. Data-class commands (``query``, ``stats``) historically
|
|
8
|
+
called ``load_config`` outside any try block, so a missing or malformed
|
|
9
|
+
config produced a raw Python traceback on stderr - exit 1 with no JSON
|
|
10
|
+
on stdout. Hugr's passthrough wrapper sees that as "tool crashed" and
|
|
11
|
+
hides the underlying config error from the user.
|
|
12
|
+
|
|
13
|
+
This module gives data commands a single, uniform way to satisfy the
|
|
14
|
+
hugr CLI contract for ``--json``: stdout is exactly one line of valid
|
|
15
|
+
JSON, ok=false on failure, exit code mapped from CONVENTIONS.md.
|
|
16
|
+
|
|
17
|
+
Usage
|
|
18
|
+
-----
|
|
19
|
+
from yaams.cli._envelope import JsonFailureGuard
|
|
20
|
+
|
|
21
|
+
@cli.command(...)
|
|
22
|
+
def query(..., as_json: bool):
|
|
23
|
+
with JsonFailureGuard("query", as_json=as_json):
|
|
24
|
+
cfg = load_config(config_path)
|
|
25
|
+
... # rest of the command body
|
|
26
|
+
|
|
27
|
+
The guard is a no-op when ``as_json`` is False - human-mode callers
|
|
28
|
+
keep seeing the traceback so debugging stays easy.
|
|
29
|
+
"""
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import sqlite3
|
|
33
|
+
import sys
|
|
34
|
+
import traceback
|
|
35
|
+
from contextlib import contextmanager
|
|
36
|
+
from typing import Iterator, TextIO
|
|
37
|
+
|
|
38
|
+
from yaams.conventions import (
|
|
39
|
+
EXIT_NOT_FOUND,
|
|
40
|
+
EXIT_USER_ERROR,
|
|
41
|
+
data_error,
|
|
42
|
+
emit_data_error,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _classify(exc: BaseException) -> tuple[str, str | None, int]:
    """Map an exception to ``(error_code, hint, exit_code)``.

    Known exception classes get a stable ``error_code`` so callers can branch
    on it; anything unrecognised becomes ``"unhandled"`` with no hint and
    ``EXIT_USER_ERROR``.

    Mapping, checked in this order:

    - ``yaml.YAMLError``          -> ``config_invalid``, ``EXIT_USER_ERROR``
    - ``FileNotFoundError``       -> ``config_not_found`` when the missing
      path (or, lacking one, the message) looks like a config file, else
      ``db_open_failed``; both use ``EXIT_NOT_FOUND``
    - ``sqlite3.OperationalError`` -> ``db_open_failed``, ``EXIT_USER_ERROR``
    - ``ValueError``              -> ``config_invalid``, ``EXIT_USER_ERROR``

    This function must never raise: it runs inside the guard's ``except``
    handler, so an error here would replace the JSON envelope with a raw
    traceback.
    """
    # YAML parse errors are checked before the FileNotFoundError branch so
    # the parser error takes priority if an exception ever matches both.
    try:
        import yaml  # local import: yaml is a runtime dep but importing
        # at module top wires it into every import chain.
    except ImportError:  # pragma: no cover - PyYAML is required at runtime
        yaml = None  # type: ignore[assignment]

    if yaml is not None and isinstance(exc, yaml.YAMLError):
        return (
            "config_invalid",
            # PyYAML has no ``python -m yaml`` entry point, so the hint
            # spells out a one-liner that actually reproduces the error.
            "Fix the YAML syntax in your config file; "
            "`python -c 'import sys, yaml; yaml.safe_load(open(sys.argv[1]))' <path>` "
            "shows the parse error.",
            EXIT_USER_ERROR,
        )
    if isinstance(exc, FileNotFoundError):
        # Distinguish config-not-found from db-file-not-found by inspecting
        # the missing-file path. ``.filename`` is whatever object was passed
        # to the failing call, so it may be ``str``, ``bytes`` or path-like;
        # normalise it to text before matching. Hand-raised errors without a
        # filename fall back to the exception message.
        raw_target = getattr(exc, "filename", None)
        if not raw_target:
            target = str(exc)
        elif isinstance(raw_target, bytes):
            target = raw_target.decode("utf-8", "replace")
        else:
            target = str(raw_target)
        target_lower = target.lower()
        # The quoted suffixes match the tail of the stock OSError message
        # (``... No such file or directory: 'x.yaml'``).
        # NOTE(review): the bare "config" substring is deliberately broad and
        # also matches any path under a ``.config`` directory - confirm that
        # a missing db stored there should report ``config_not_found``.
        looks_like_config = (
            target_lower.endswith((".yaml", ".yml", ".yaml'", ".yml'"))
            or "config" in target_lower
        )
        if looks_like_config:
            return (
                "config_not_found",
                "Run `hugr init` to generate a config, or pass --config <path>.",
                EXIT_NOT_FOUND,
            )
        return (
            "db_open_failed",
            "Run `yaams init-db` to create the database.",
            EXIT_NOT_FOUND,
        )
    if isinstance(exc, sqlite3.OperationalError):
        return (
            "db_open_failed",
            "Run `yaams init-db` or check the db_path in your config.",
            EXIT_USER_ERROR,
        )
    if isinstance(exc, ValueError):
        # ``Config file must contain a mapping`` and ``Config is missing
        # db_path`` both surface as ValueError today.
        return (
            "config_invalid",
            "Check your config.yaml against the example in the repo.",
            EXIT_USER_ERROR,
        )
    return ("unhandled", None, EXIT_USER_ERROR)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@contextmanager
def JsonFailureGuard(
    command: str,
    *,
    as_json: bool,
    stdout: TextIO | None = None,
    stderr: TextIO | None = None,
) -> Iterator[None]:
    """Turn failures inside a data-command body into a JSON error envelope.

    Parameters
    ----------
    command:
        Command name passed through unchanged into the envelope.
    as_json:
        ``False`` makes the guard transparent: the body runs and whatever
        it raises propagates. ``True`` makes the guard intercept every
        exception except ``SystemExit``, classify it with ``_classify``,
        emit the resulting ``data_error`` envelope to ``stdout``, print the
        traceback to ``stderr`` and terminate via ``sys.exit`` with the
        classified exit code.
    stdout, stderr:
        Stream overrides (useful in tests). When omitted, ``sys.stdout`` /
        ``sys.stderr`` are looked up as the guard is entered.

    A ``SystemExit`` raised by the body is re-raised untouched.
    """
    if as_json:
        # Streams are resolved on entry, before the body runs.
        envelope_stream = sys.stdout if stdout is None else stdout
        trace_stream = sys.stderr if stderr is None else stderr

        try:
            yield
        except SystemExit:
            # The body chose its own exit status; do not interfere.
            raise
        except BaseException as failure:  # noqa: BLE001 - guard is the catch-all
            error_code, hint, status = _classify(failure)
            emit_data_error(
                data_error(
                    command=command,
                    code=error_code,
                    message=str(failure),
                    hint=hint,
                ),
                stream=envelope_stream,
            )
            # The traceback goes to the error stream only, keeping the
            # envelope stream down to the single JSON line.
            traceback.print_exception(
                type(failure), failure, failure.__traceback__, file=trace_stream
            )
            sys.exit(status)
    else:
        yield
|
yaams/cli/_root.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import click
|
|
4
|
+
|
|
5
|
+
from yaams import __version__
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@click.group(invoke_without_command=True)
@click.version_option(__version__, prog_name="yaams")
@click.option(
    "--doctor",
    is_flag=True,
    default=False,
    help="Run health check and exit (data class; pair with --json for machine output).",
)
@click.option(
    "--json",
    "as_json_top",
    is_flag=True,
    default=False,
    help="Machine mode for top-level --doctor (subcommand form: `yaams doctor --json`).",
)
@click.option(
    "--config",
    "config_path_top",
    default=None,
    help="Path to config.yaml. Honored by top-level --doctor; subcommands take their own --config.",
)
@click.pass_context
def cli(ctx: click.Context, doctor: bool, as_json_top: bool, config_path_top: str | None) -> None:
    # Root command group. Deliberately left without a docstring: click would
    # pick it up as the group's help text and change the ``--help`` output.
    if doctor:
        # Imported lazily so the doctor module is only loaded when asked for.
        from yaams.cli.doctor import emit_doctor

        doctor_status = emit_doctor(config_path_top, as_json_top)
        ctx.exit(doctor_status)
    if ctx.invoked_subcommand is None:
        # Bare ``yaams`` with no subcommand: print the help and exit cleanly.
        help_text = ctx.get_help()
        click.echo(help_text)
        ctx.exit(0)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@cli.command("doctor")
@click.option(
    "--config",
    "config_path",
    default=None,
    help="Path to config.yaml.",
)
@click.option(
    "--json",
    "as_json",
    is_flag=True,
    default=False,
    help="Emit the doctor payload as JSON (machine mode).",
)
def doctor_cmd(config_path: str | None, as_json: bool) -> None:
    """Run health check (subcommand alias for --doctor)."""
    from yaams.cli.doctor import emit_doctor

    # The value returned by ``emit_doctor`` becomes the process exit status.
    raise click.exceptions.Exit(emit_doctor(config_path, as_json))
|
yaams/cli/_shared.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Iterable
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from yaams.enrich import Embedder, EntityTagger
|
|
11
|
+
from yaams.ingest import Item
|
|
12
|
+
from yaams.schema import DEFAULT_EMBEDDING_DIM
|
|
13
|
+
|
|
14
|
+
# Where HF model weights live by default. We keep them out of `~/.cache`
|
|
15
|
+
# because they're durable, multi-GB artifacts, not regenerable cache.
|
|
16
|
+
DEFAULT_MODELS_DIR = "~/.local/share/huggingface"
|
|
17
|
+
|
|
18
|
+
# Help text for the ``--config`` option that ``config_option`` attaches.
_CONFIG_HELP = (
    "Path to config.yaml. Auto-resolves from $YAAMS_CONFIG, "
    "~/.config/yaams/config.yaml, or repo root if omitted."
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def config_option(f):
    """Decorate ``f`` with the shared ``--config`` click option.

    The option value is delivered to the callback as ``config_path`` and
    defaults to ``None``.
    """
    attach_option = click.option(
        "--config",
        "config_path",
        default=None,
        help=_CONFIG_HELP,
    )
    return attach_option(f)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _embed_config(cfg: dict) -> dict:
    """Build the keyword arguments for ``Embedder`` from the ``embed`` section.

    Works on a copy, so ``cfg`` is left untouched. ``model`` is required
    (``KeyError`` if absent) and is placed first in the result; all other
    keys of the section follow unchanged.

    ``models_dir`` precedence: an explicit config value wins; otherwise an
    externally set, non-empty ``$HF_HOME`` is respected by adding nothing;
    otherwise ``DEFAULT_MODELS_DIR`` is filled in so model weights survive
    ``~/.cache`` wipes.
    """
    settings = dict(cfg.get("embed", {}))
    model_name = settings.pop("model")

    models_dir_missing = "models_dir" not in settings
    if models_dir_missing and not os.environ.get("HF_HOME"):
        settings["models_dir"] = DEFAULT_MODELS_DIR

    kwargs = {"model": model_name}
    kwargs.update(settings)
    return kwargs
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _embedding_dim(cfg: dict) -> int:
    """Return ``embed.dimension`` from ``cfg`` as an ``int``.

    Falls back to ``DEFAULT_EMBEDDING_DIM`` when the ``embed`` section or
    its ``dimension`` key is absent.
    """
    embed_section = cfg.get("embed", {})
    configured = embed_section.get("dimension", DEFAULT_EMBEDDING_DIM)
    return int(configured)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _entities_config(cfg: dict) -> dict:
    """Return a shallow copy of the ``entities`` section (empty if absent)."""
    section = cfg.get("entities", {})
    return dict(section)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _entity_dictionary(cfg: dict) -> list[dict]:
    """Return a new list holding the ``entities.dictionary`` entries.

    Yields an empty list when the section or the key is missing.
    """
    entries = _entities_config(cfg).get("dictionary", [])
    return list(entries)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _progress(iterable: Iterable[Item], desc: str, unit: str = "it") -> Iterable[Item]:
    """Wrap ``iterable`` in a tqdm progress bar when tqdm is importable.

    If an ``ImportError`` is raised while importing or constructing the bar,
    the original iterable is handed back unchanged.
    """
    try:
        from tqdm import tqdm as make_bar

        wrapped = make_bar(iterable, desc=desc, unit=unit)
    except ImportError:
        wrapped = iterable
    return wrapped
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _date(value: str | None) -> str:
    """Return the first ten characters of ``value``, or ``"n/a"`` if falsy."""
    return value[:10] if value else "n/a"
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _size_mb(path: Path) -> float:
    """Return the size of the file at ``path`` in units of 1024 * 1024 bytes."""
    bytes_per_unit = 1024 * 1024
    size_in_bytes = path.stat().st_size
    return size_in_bytes / bytes_per_unit
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _format_duration(ms: float) -> str:
    """Render a millisecond duration as a short human-readable string.

    Returns ``"<n>ms"`` below one second, ``"<s.s>s"`` below one minute and
    ``"<m>m<ss.s>s"`` from one minute upwards.

    The unit is chosen from the value *as it will be displayed*, so rounding
    can never print an out-of-range field: 999.6 ms renders as ``"1.0s"``
    (not ``"1000ms"``), 59960 ms as ``"1m00.0s"`` (not ``"60.0s"``) and
    119999 ms as ``"2m00.0s"`` (not ``"1m60.0s"``). Output for every value
    away from those rounding boundaries is unchanged.
    """
    # ``round`` and the format mini-language round the same way, so these
    # comparisons predict exactly what the f-strings below would print.
    if round(ms, 0) < 1000:
        return f"{ms:.0f}ms"
    seconds = ms / 1000
    if round(seconds, 1) < 60:
        return f"{seconds:.1f}s"
    minutes, remainder = divmod(seconds, 60)
    whole_minutes = int(minutes)
    if round(remainder, 1) >= 60:
        # The seconds field would display as "60.0": carry into the minutes.
        whole_minutes += 1
        remainder = 0.0
    return f"{whole_minutes}m{remainder:04.1f}s"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _format_throughput(seen: int, ms: float) -> str:
    """Return a ``", <rate> items/s"`` suffix, or ``""`` when undefined.

    The rate is ``seen`` divided by the elapsed seconds, shown with a
    thousands separator and one decimal. A non-positive count or duration
    yields the empty string.
    """
    if seen <= 0 or ms <= 0:
        return ""
    elapsed_seconds = ms / 1000
    items_per_second = seen / elapsed_seconds
    return f", {items_per_second:,.1f} items/s"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass
class ProcessingContext:
    """Config holder that builds its embedder and entity tagger on demand.

    Only ``cfg`` is accepted by the constructor. Each helper is created on
    first access of its property and the same instance is returned on later
    accesses.
    """

    cfg: dict
    # Cached helper instances; excluded from ``__init__`` and filled lazily.
    _embedder: Embedder | None = field(default=None, init=False)
    _tagger: EntityTagger | None = field(default=None, init=False)

    @property
    def embedder(self) -> Embedder:
        """Return the cached ``Embedder``, creating it from ``cfg`` if needed."""
        if self._embedder is not None:
            return self._embedder
        self._embedder = Embedder(**_embed_config(self.cfg))
        return self._embedder

    @property
    def tagger(self) -> EntityTagger:
        """Return the cached ``EntityTagger``, creating it from ``cfg`` if needed."""
        if self._tagger is not None:
            return self._tagger
        entity_settings = _entities_config(self.cfg)
        self._tagger = EntityTagger(
            entity_settings.get("spacy_model"),
            _entity_dictionary(self.cfg),
            spacy_model_nb=entity_settings.get("spacy_model_nb"),
        )
        return self._tagger
|