yaams 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. yaams/__init__.py +6 -0
  2. yaams/_default_config.yaml +129 -0
  3. yaams/cli/__init__.py +22 -0
  4. yaams/cli/_envelope.py +167 -0
  5. yaams/cli/_root.py +57 -0
  6. yaams/cli/_shared.py +107 -0
  7. yaams/cli/assoc.py +224 -0
  8. yaams/cli/consolidate.py +190 -0
  9. yaams/cli/doctor.py +182 -0
  10. yaams/cli/enrich.py +85 -0
  11. yaams/cli/entities.py +1402 -0
  12. yaams/cli/ingest.py +923 -0
  13. yaams/cli/main.py +354 -0
  14. yaams/cli/promote.py +232 -0
  15. yaams/cli/query.py +594 -0
  16. yaams/cli/review.py +180 -0
  17. yaams/cli/signals.py +143 -0
  18. yaams/cli/sources.py +1173 -0
  19. yaams/config.py +158 -0
  20. yaams/consolidate/__init__.py +19 -0
  21. yaams/consolidate/session.py +201 -0
  22. yaams/conventions.py +241 -0
  23. yaams/db.py +61 -0
  24. yaams/enrich/__init__.py +5 -0
  25. yaams/enrich/embed.py +78 -0
  26. yaams/enrich/entities.py +156 -0
  27. yaams/ingest/__init__.py +4 -0
  28. yaams/ingest/base.py +34 -0
  29. yaams/ingest/calendar.py +85 -0
  30. yaams/ingest/email_mbox.py +491 -0
  31. yaams/ingest/folder.py +300 -0
  32. yaams/ingest/github.py +360 -0
  33. yaams/ingest/imessage.py +280 -0
  34. yaams/ingest/ledger_notes.py +66 -0
  35. yaams/ingest/m365_mail.py +257 -0
  36. yaams/ingest/obsidian.py +143 -0
  37. yaams/ingest/signal.py +552 -0
  38. yaams/ingest/teams.py +353 -0
  39. yaams/ingest/teams_chatsvc.py +399 -0
  40. yaams/logsetup.py +77 -0
  41. yaams/people_import.py +256 -0
  42. yaams/promote/__init__.py +4 -0
  43. yaams/promote/candidates.py +355 -0
  44. yaams/promote/review.py +98 -0
  45. yaams/render.py +130 -0
  46. yaams/retrieve/__init__.py +19 -0
  47. yaams/retrieve/associate.py +232 -0
  48. yaams/retrieve/hybrid.py +559 -0
  49. yaams/retrieve/metadata.py +64 -0
  50. yaams/retrieve/parse.py +377 -0
  51. yaams/retrieve/route.py +137 -0
  52. yaams/retrieve/synonyms.py +93 -0
  53. yaams/schema.py +377 -0
  54. yaams/signals/__init__.py +37 -0
  55. yaams/signals/logger.py +160 -0
  56. yaams/signals/review.py +749 -0
  57. yaams/store.py +803 -0
  58. yaams/synthesize/__init__.py +29 -0
  59. yaams/synthesize/answer.py +232 -0
  60. yaams/synthesize/llm.py +266 -0
  61. yaams/time.py +31 -0
  62. yaams/watermark.py +36 -0
  63. yaams-0.3.0.dist-info/METADATA +308 -0
  64. yaams-0.3.0.dist-info/RECORD +68 -0
  65. yaams-0.3.0.dist-info/WHEEL +5 -0
  66. yaams-0.3.0.dist-info/entry_points.txt +2 -0
  67. yaams-0.3.0.dist-info/licenses/LICENSE +21 -0
  68. yaams-0.3.0.dist-info/top_level.txt +1 -0
yaams/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """YAAMS Phase A ingest package."""
2
+
3
# Only the version string is exported from the package root.
__all__ = ["__version__"]

# Package version string for this release.
__version__ = "0.3.0"
6
+
@@ -0,0 +1,129 @@
1
+ # YAAMS configuration. Copy to config.yaml and fill in your own values.
2
+ # See README.md and .plans/ for what each block does.
3
+
4
+ db_path: ~/yaams/data.db
5
+
6
+ ingest:
7
+ since: '2025-01-01T00:00:00Z'
8
+
9
+ imessage:
10
+ enabled: true
11
+ chat_db_path: ~/Library/Messages/chat.db
12
+
13
+ email:
14
+ enabled: true
15
+ sources:
16
+ # Apple Mail .emlx tree (point at the newest ~/Library/Mail/Vxx)
17
+ - type: emlx
18
+ path: ~/Library/Mail/V10
19
+ # Or one or more .mbox exports:
20
+ # - type: mbox
21
+ # path: ~/Downloads/all-mail.mbox
22
+ skip_newsletters: true
23
+ # Emails From these addresses are always kept regardless of List-* headers
24
+ # or noreply patterns. Add every address you send from across roles.
25
+ user_addresses:
26
+ - you@example.com
27
+
28
+ # Personal Obsidian vault. One vault per source; honors frontmatter
29
+ # `date:` / `created:` or YYYY-MM-DD filename prefixes for timestamps,
30
+ # strips embeds/wikilinks/frontmatter before indexing. Set `vault_path`
31
+ # via the `yaams sources` TUI or by editing it here.
32
+ notes:
33
+ enabled: false
34
+ vault_path: ~/Documents/Obsidian
35
+ # Optional. Directory names to skip anywhere in the vault tree.
36
+ # Defaults to .obsidian, .git, .smartchats, .smart-env, .claude.
37
+ # skip_dirs: [.obsidian, .git, archive]
38
+
39
+ # Generic recursive folder ingestion. Reads .txt and .md natively;
40
+ # .pdf needs `pip install pypdf`, .docx needs `pip install python-docx`.
41
+ # Files of unsupported types (or types with the dep missing) are skipped.
42
+ # Paths can be added/removed from the `yaams sources` TUI.
43
+ folders:
44
+ enabled: false
45
+ paths: []
46
+ # - ~/Documents/notes
47
+ # - ~/work/specs
48
+ # Optional. Defaults to .txt, .md, .markdown, .pdf, .docx
49
+ # extensions: [.txt, .md, .pdf]
50
+ # Optional. Directory names to skip anywhere in the tree.
51
+ # skip_dirs: [.git, .obsidian, node_modules]
52
+
53
+ # ---- Microsoft 365 / Graph sources ----
54
+ # All M365 sources authenticate via owa-piggy. Configure profiles once with
55
+ # `owa-piggy setup --profile <name>` and reference them by alias below.
56
+ # `calendar` and `mail` are profile-keyed and produce one yaams source per
57
+ # profile (e.g. mail_work, calendar_work). `teams` likewise.
58
+
59
+ teams:
60
+ enabled: false
61
+ profiles:
62
+ - work
63
+ skip_bots: true
64
+ page_size: 50
65
+
66
+ # owa-mail-backed inbox + sent ingestion. One source per profile.
67
+ # `folders` defaults to [Inbox, SentItems] — add Archive/etc. if wanted.
68
+ mail:
69
+ enabled: false
70
+ profiles:
71
+ - work
72
+ folders:
73
+ - Inbox
74
+ - SentItems
75
+ skip_newsletters: true
76
+ # Date range is sliced into chunks to stay under owa-mail's 200/req cap.
77
+ chunk_days: 30
78
+ # Addresses you send from. Messages From these are never filtered as
79
+ # newsletters/automated even if subject patterns would otherwise match.
80
+ user_addresses: []
81
+ # - you@example.com
82
+
83
+ # calendar:
84
+ # enabled: false
85
+ # profiles:
86
+ # - work
87
+ # skip_free: true
88
+
89
+ embed:
90
+ model: BAAI/bge-m3
91
+ batch_size: 32
92
+ # Apple Silicon: mps. Otherwise: cpu. CUDA also works if torch sees it.
93
+ device: mps
94
+ dimension: 1024
95
+ # Use only the locally cached HF snapshot; no network calls per run.
96
+ # Set to false the first time you change `model` so the new weights download.
97
+ offline: true
98
+ # Where HF stores model weights (sets HF_HOME). Defaults to
99
+ # ~/.local/share/huggingface so multi-GB weights survive `~/.cache` wipes.
100
+ # Respects an externally set $HF_HOME if you'd rather configure it that way.
101
+ # models_dir: ~/.local/share/huggingface
102
+
103
+ # LLM backend for synthesis (and future query parsing).
104
+ # Used by `yaams query --answer ...`. Off by default (dummy backend); pick
105
+ # one of:
106
+ # backend: ollama # local Ollama server
107
+ # model: llama3.1
108
+ # host: http://localhost:11434
109
+ # backend: subprocess # any CLI that takes prompt on stdin and prints answer
110
+ # command: ["codex", "exec", "--prompt-stdin"]
111
+ # # or: ["claude", "-p"]
112
+ # backend: dummy # no synthesis (also default if section omitted)
113
+ synth:
114
+ backend: dummy
115
+
116
+ entities:
117
+ spacy_model: xx_ent_wiki_sm
118
+ dictionary:
119
+ # Known people, places, projects, and aliases for entity tagging.
120
+ # The canonical name goes under `canonical`; casual variants go under `aliases`.
121
+ - canonical: Example Person
122
+ type: person
123
+ aliases:
124
+ - Ex
125
+ - E. Person
126
+ - canonical: Example Org
127
+ type: org
128
+ aliases:
129
+ - EX
yaams/cli/__init__.py ADDED
@@ -0,0 +1,22 @@
1
from __future__ import annotations

# The `noqa: F401` imports below are not referenced by name in this module.
# NOTE(review): presumably they are imported for their side effect of
# attaching subcommands to the root `cli` group - confirm before reordering.
from yaams.cli import consolidate as _consolidate_mod # noqa: F401
from yaams.cli import assoc, enrich, entities, main, promote, query, review, signals, sources # noqa: F401
from yaams.cli import ingest as _ingest_mod # noqa: F401
from yaams.cli._root import cli
from yaams.cli._shared import _format_duration
from yaams.cli.consolidate import consolidate
from yaams.cli.ingest import _record_ingest_run, ingest
from yaams.cli.main import init_db, reset_db
from yaams.cli.query import query_cmd

# Names re-exported from the package root. Two underscore-prefixed helpers
# (`_format_duration`, `_record_ingest_run`) are deliberately listed so that
# `from yaams.cli import *` exposes them as well.
__all__ = [
    "cli",
    "init_db",
    "reset_db",
    "ingest",
    "query_cmd",
    "consolidate",
    "_format_duration",
    "_record_ingest_run",
]
yaams/cli/_envelope.py ADDED
@@ -0,0 +1,167 @@
1
+ """JSON failure envelope guard for YAAMS data-class commands.
2
+
3
+ Background (Plan 06)
4
+ --------------------
5
+ Action-class commands (init-db, setup, ingest, reset-db) already wrap
6
+ their ``load_config`` call in a try/except and emit an action envelope
7
+ on failure. Data-class commands (``query``, ``stats``) historically
8
+ called ``load_config`` outside any try block, so a missing or malformed
9
+ config produced a raw Python traceback on stderr - exit 1 with no JSON
10
+ on stdout. Hugr's passthrough wrapper sees that as "tool crashed" and
11
+ hides the underlying config error from the user.
12
+
13
+ This module gives data commands a single, uniform way to satisfy the
14
+ hugr CLI contract for ``--json``: stdout is exactly one line of valid
15
+ JSON, ok=false on failure, exit code mapped from CONVENTIONS.md.
16
+
17
+ Usage
18
+ -----
19
+ from yaams.cli._envelope import JsonFailureGuard
20
+
21
+ @cli.command(...)
22
+ def query(..., as_json: bool):
23
+ with JsonFailureGuard("query", as_json=as_json):
24
+ cfg = load_config(config_path)
25
+ ... # rest of the command body
26
+
27
+ The guard is a no-op when ``as_json`` is False - human-mode callers
28
+ keep seeing the traceback so debugging stays easy.
29
+ """
30
+ from __future__ import annotations
31
+
32
+ import sqlite3
33
+ import sys
34
+ import traceback
35
+ from contextlib import contextmanager
36
+ from typing import Iterator, TextIO
37
+
38
+ from yaams.conventions import (
39
+ EXIT_NOT_FOUND,
40
+ EXIT_USER_ERROR,
41
+ data_error,
42
+ emit_data_error,
43
+ )
44
+
45
+
46
def _classify(exc: BaseException) -> tuple[str, str | None, int]:
    """Map an exception to ``(error_code, hint, exit_code)``.

    Known classes get a stable code so hugr and other callers can branch
    on it; everything else falls into ``unhandled`` with EXIT_USER_ERROR.

    Exit codes follow CONVENTIONS.md:
    - EXIT_NOT_FOUND (4) for missing config/db files
    - EXIT_USER_ERROR (1) for malformed config and other user-recoverable
      errors

    The checks run in a fixed order: YAML parse errors, then
    ``FileNotFoundError``, then ``sqlite3.OperationalError``, then
    ``ValueError``. The first match wins.
    """
    import os  # local import, mirroring the lazy yaml import below

    # YAML parse errors land before FileNotFoundError checks because some
    # yaml.YAMLError subclasses also subclass OSError on certain platforms;
    # we want the parser error to take priority.
    try:
        import yaml  # local import: yaml is a runtime dep but importing
        # at module top wires it into every import chain.
    except ImportError:  # pragma: no cover - PyYAML is required at runtime
        yaml = None  # type: ignore[assignment]

    if yaml is not None and isinstance(exc, yaml.YAMLError):
        # NOTE(review): PyYAML does not appear to ship a ``__main__``, so the
        # `python -m yaml` command suggested below may not run - confirm.
        return (
            "config_invalid",
            "Fix the YAML syntax in your config file; `python -m yaml < path` "
            "shows the parse error.",
            EXIT_USER_ERROR,
        )
    if isinstance(exc, FileNotFoundError):
        # Distinguish config-not-found from db-file-not-found by inspecting
        # the missing-file path. ``FileNotFoundError`` carries the filename
        # in ``.filename`` when raised via pathlib/open; fall back to the
        # message for hand-raised cases.
        target = getattr(exc, "filename", None) or str(exc)
        # ``.filename`` is whatever object was handed to the failing call, so
        # it may be ``bytes`` or a ``PathLike`` rather than ``str``. Normalise
        # it first: an AttributeError/TypeError raised here would escape the
        # guard and break the one-JSON-line contract.
        if isinstance(target, bytes):
            target = os.fsdecode(target)
        elif not isinstance(target, str):
            target = str(target)
        target_lower = target.lower()
        # The quoted suffixes match the repr-style tail of ``str(exc)``.
        # The substring test also covers the ``/yaams/config.`` and
        # ``/hugr/yaams/config.`` locations, since both contain "config".
        if (
            target_lower.endswith((".yaml", ".yml", ".yaml'", ".yml'"))
            or "config" in target_lower
        ):
            return (
                "config_not_found",
                "Run `hugr init` to generate a config, or pass --config <path>.",
                EXIT_NOT_FOUND,
            )
        return (
            "db_open_failed",
            "Run `yaams init-db` to create the database.",
            EXIT_NOT_FOUND,
        )
    if isinstance(exc, sqlite3.OperationalError):
        return (
            "db_open_failed",
            "Run `yaams init-db` or check the db_path in your config.",
            EXIT_USER_ERROR,
        )
    if isinstance(exc, ValueError):
        # ``Config file must contain a mapping`` and ``Config is missing
        # db_path`` both surface as ValueError today.
        return (
            "config_invalid",
            "Check your config.yaml against the example in the repo.",
            EXIT_USER_ERROR,
        )
    return ("unhandled", None, EXIT_USER_ERROR)
114
+
115
+
116
@contextmanager
def JsonFailureGuard(
    command: str,
    *,
    as_json: bool,
    stdout: TextIO | None = None,
    stderr: TextIO | None = None,
) -> Iterator[None]:
    """Turn failures inside a data-command body into a JSON error envelope.

    Parameters
    ----------
    command:
        Command name placed verbatim in the envelope (e.g. ``"query"``,
        ``"stats"``).
    as_json:
        If False, the guard does nothing: the body runs and any exception
        propagates to the caller untouched. If True, an exception raised in
        the body is classified via ``_classify``, written to ``stdout`` as a
        ``data_error`` envelope, its traceback is printed to ``stderr``, and
        the process exits with the classified exit code.
    stdout, stderr:
        Stream overrides, mainly for tests. When omitted, ``sys.stdout`` and
        ``sys.stderr`` are looked up as the guard is entered.

    ``SystemExit`` raised by the body always passes through unchanged. Every
    other ``BaseException`` (``KeyboardInterrupt`` included) is converted.
    """
    if as_json:
        json_stream = sys.stdout if stdout is None else stdout
        trace_stream = sys.stderr if stderr is None else stderr
        try:
            yield
        except SystemExit:
            # An explicit exit from the body is not a failure to report.
            raise
        except BaseException as failure:  # noqa: BLE001 - guard is the catch-all
            error_code, hint, status = _classify(failure)
            emit_data_error(
                data_error(
                    command=command,
                    code=error_code,
                    message=str(failure),
                    hint=hint,
                ),
                stream=json_stream,
            )
            # Keep the traceback off stdout so stdout stays a single JSON line.
            traceback.print_exception(
                type(failure), failure, failure.__traceback__, file=trace_stream
            )
            sys.exit(status)
    else:
        yield
yaams/cli/_root.py ADDED
@@ -0,0 +1,57 @@
1
+ from __future__ import annotations
2
+
3
+ import click
4
+
5
+ from yaams import __version__
6
+
7
+
8
# NOTE: click uses a command function's docstring as its help text, so this
# group is documented with comments only to leave `yaams --help` unchanged.
@click.group(invoke_without_command=True)
@click.version_option(__version__, prog_name="yaams")
@click.option(
    "--doctor",
    is_flag=True,
    default=False,
    help="Run health check and exit (data class; pair with --json for machine output).",
)
@click.option(
    "--json",
    "as_json_top",
    is_flag=True,
    default=False,
    help="Machine mode for top-level --doctor (subcommand form: `yaams doctor --json`).",
)
@click.option(
    "--config",
    "config_path_top",
    default=None,
    help="Path to config.yaml. Honored by top-level --doctor; subcommands take their own --config.",
)
@click.pass_context
def cli(ctx: click.Context, doctor: bool, as_json_top: bool, config_path_top: str | None) -> None:
    # Top-level `--doctor`: run the health check and exit with its status.
    # `ctx.exit` raises, so nothing below runs in that case.
    if doctor:
        # Imported lazily so the doctor module loads only when requested.
        from yaams.cli.doctor import emit_doctor

        status = emit_doctor(config_path_top, as_json_top)
        ctx.exit(status)
    # A subcommand was given: let click dispatch to it.
    if ctx.invoked_subcommand is not None:
        return
    # Bare `yaams`: print the group help and exit successfully.
    click.echo(ctx.get_help())
    ctx.exit(0)
37
+
38
+
39
@cli.command("doctor")
@click.option(
    "--config",
    "config_path",
    default=None,
    help="Path to config.yaml.",
)
@click.option(
    "--json",
    "as_json",
    is_flag=True,
    default=False,
    help="Emit the doctor payload as JSON (machine mode).",
)
def doctor_cmd(config_path: str | None, as_json: bool) -> None:
    """Run health check (subcommand alias for --doctor)."""
    # Imported lazily so the doctor module loads only when this command runs.
    from yaams.cli.doctor import emit_doctor

    # The value returned by emit_doctor becomes the process exit status.
    raise click.exceptions.Exit(emit_doctor(config_path, as_json))
yaams/cli/_shared.py ADDED
@@ -0,0 +1,107 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from typing import Iterable
7
+
8
+ import click
9
+
10
+ from yaams.enrich import Embedder, EntityTagger
11
+ from yaams.ingest import Item
12
+ from yaams.schema import DEFAULT_EMBEDDING_DIM
13
+
14
# Where HF model weights live by default. We keep them out of `~/.cache`
# because they're durable, multi-GB artifacts, not regenerable cache.
# Stored unexpanded (leading `~`); this module does not expand it.
DEFAULT_MODELS_DIR = "~/.local/share/huggingface"

# Help text attached to the `--config` option built by `config_option`.
_CONFIG_HELP = (
    "Path to config.yaml. Auto-resolves from $YAAMS_CONFIG, "
    "~/.config/yaams/config.yaml, or repo root if omitted."
)
22
+
23
+
24
def config_option(f):
    """Decorate a click command with the shared ``--config`` option.

    The option value is passed to the command as ``config_path`` and
    defaults to ``None`` when the flag is omitted.
    """
    add_config_flag = click.option(
        "--config",
        "config_path",
        default=None,
        help=_CONFIG_HELP,
    )
    return add_config_flag(f)
26
+
27
+
28
+ def _embed_config(cfg: dict) -> dict:
29
+ raw = dict(cfg.get("embed", {}))
30
+ model = raw.pop("model")
31
+ # Config wins; otherwise respect an externally set $HF_HOME; otherwise fall
32
+ # back to DEFAULT_MODELS_DIR so models survive `~/.cache` wipes.
33
+ if "models_dir" not in raw and not os.environ.get("HF_HOME"):
34
+ raw["models_dir"] = DEFAULT_MODELS_DIR
35
+ return {"model": model, **raw}
36
+
37
+
38
+ def _embedding_dim(cfg: dict) -> int:
39
+ return int(cfg.get("embed", {}).get("dimension", DEFAULT_EMBEDDING_DIM))
40
+
41
+
42
+ def _entities_config(cfg: dict) -> dict:
43
+ return dict(cfg.get("entities", {}))
44
+
45
+
46
+ def _entity_dictionary(cfg: dict) -> list[dict]:
47
+ return list(_entities_config(cfg).get("dictionary", []))
48
+
49
+
50
+ def _progress(iterable: Iterable[Item], desc: str, unit: str = "it") -> Iterable[Item]:
51
+ try:
52
+ from tqdm import tqdm
53
+
54
+ return tqdm(iterable, desc=desc, unit=unit)
55
+ except ImportError:
56
+ return iterable
57
+
58
+
59
+ def _date(value: str | None) -> str:
60
+ if not value:
61
+ return "n/a"
62
+ return value[:10]
63
+
64
+
65
+ def _size_mb(path: Path) -> float:
66
+ return path.stat().st_size / (1024 * 1024)
67
+
68
+
69
+ def _format_duration(ms: float) -> str:
70
+ if ms < 1000:
71
+ return f"{ms:.0f}ms"
72
+ seconds = ms / 1000
73
+ if seconds < 60:
74
+ return f"{seconds:.1f}s"
75
+ minutes, seconds = divmod(seconds, 60)
76
+ return f"{int(minutes)}m{seconds:04.1f}s"
77
+
78
+
79
+ def _format_throughput(seen: int, ms: float) -> str:
80
+ if ms <= 0 or seen <= 0:
81
+ return ""
82
+ rate = seen / (ms / 1000)
83
+ return f", {rate:,.1f} items/s"
84
+
85
+
86
@dataclass
class ProcessingContext:
    """Carry the loaded config plus lazily constructed enrichment helpers.

    ``cfg`` is the only constructor argument. The embedder and the entity
    tagger are each built on first access to the matching property and
    reused on every later access.
    """

    cfg: dict
    # Caches for the lazily built helpers; excluded from ``__init__``.
    _embedder: Embedder | None = field(default=None, init=False)
    _tagger: EntityTagger | None = field(default=None, init=False)

    @property
    def embedder(self) -> Embedder:
        """Return the embedder, constructing it from ``cfg`` on first use."""
        cached = self._embedder
        if cached is None:
            cached = self._embedder = Embedder(**_embed_config(self.cfg))
        return cached

    @property
    def tagger(self) -> EntityTagger:
        """Return the entity tagger, constructing it from ``cfg`` on first use."""
        if self._tagger is not None:
            return self._tagger
        settings = _entities_config(self.cfg)
        primary_model = settings.get("spacy_model")
        known_entities = _entity_dictionary(self.cfg)
        self._tagger = EntityTagger(
            primary_model,
            known_entities,
            spacy_model_nb=settings.get("spacy_model_nb"),
        )
        return self._tagger