docent-python 0.1.0a1 (tar.gz)

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release: this version of docent-python might be problematic.

@@ -0,0 +1,192 @@
+ **/*_gitignore.*
+ **/*_gitignore/
+ *.db
+ .stignore
+ *syncthing*
+ .DS_Store
+ # *.sql (neil: disabled for ursid)
+ *.gz
+
+ *.tfstate
+ *.tfstate.backup
+ */.terraform/
+ */*.terraform.*
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # wandb
+ **/wandb/
+
+ # Marimo notebook outputs
+ **/__marimo__/
+
+ # yarn
+ **/.yarn/
+ **/.pnp.*
+
+ # data
+ *.npy
+ *.csv
+ *.pkl
+ *.eval
+
+ # personal
+ personal/caden/*
+ inspect_evals
@@ -0,0 +1,7 @@
+ Copyright 2025 Clarity AI Research, Inc. dba Transluce
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,16 @@
+ Metadata-Version: 2.4
+ Name: docent-python
+ Version: 0.1.0a1
+ Summary: Docent SDK
+ Project-URL: Homepage, https://github.com/TransluceAI/docent
+ Project-URL: Issues, https://github.com/TransluceAI/docent/issues
+ Project-URL: Docs, https://transluce-docent.readthedocs-hosted.com/en/latest
+ Author-email: Transluce AI <info@transluce.org>
+ License-Expression: MIT
+ License-File: LICENSE.md
+ Requires-Python: >=3.11
+ Requires-Dist: logging>=0.4.9.6
+ Requires-Dist: pydantic>=2.11.7
+ Requires-Dist: pyyaml>=6.0.2
+ Requires-Dist: sqlalchemy>=2.0.41
+ Requires-Dist: tiktoken>=0.7.0
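
The metadata above declares the supported interpreter (Python >= 3.11) and five runtime dependencies. As a quick sanity check, the standard library can read the same metadata back once the package is installed; the snippet below is illustrative only and not part of docent-python:

    # Illustrative check, not part of the package; assumes docent-python is installed.
    from importlib.metadata import metadata, version

    print(version("docent-python"))        # expected: 0.1.0a1
    info = metadata("docent-python")
    print(info["Requires-Python"])         # expected: >=3.11
    print(info.get_all("Requires-Dist"))   # the five Requires-Dist entries listed above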
@@ -0,0 +1,3 @@
+ __all__ = ["Docent"]
+
+ from docent.sdk.client import Docent
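
This top-level __init__ re-exports the SDK client, so the short import path and the fully qualified one resolve to the same class. A minimal sketch, assuming the package is installed:

    # Both names refer to the same class, per the re-export above.
    from docent import Docent
    from docent.sdk.client import Docent as DocentClient

    assert Docent is DocentClient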
@@ -0,0 +1,3 @@
+ __all__ = ["get_logger"]
+
+ from docent._log_util.logger import get_logger
@@ -0,0 +1,141 @@
+ import logging
+ import sys
+ from dataclasses import dataclass
+ from typing import Any, Dict, Literal, MutableMapping, Optional, Tuple
+
+
+ @dataclass
+ class ColorCode:
+     fore: str
+     style: str = ""
+
+
+ class Colors:
+     # Foreground colors
+     BLACK = ColorCode("\033[30m")
+     RED = ColorCode("\033[31m")
+     GREEN = ColorCode("\033[32m")
+     YELLOW = ColorCode("\033[33m")
+     BLUE = ColorCode("\033[34m")
+     MAGENTA = ColorCode("\033[35m")
+     CYAN = ColorCode("\033[36m")
+     WHITE = ColorCode("\033[37m")
+     BRIGHT_MAGENTA = ColorCode("\033[95m")
+     BRIGHT_CYAN = ColorCode("\033[96m")
+
+     # Styles
+     BOLD = "\033[1m"
+     RESET = "\033[0m"
+
+     @staticmethod
+     def apply(text: str, color: ColorCode) -> str:
+         return f"{color.style}{color.fore}{text}{Colors.RESET}"
+
+
+ class ColoredFormatter(logging.Formatter):
+     COLORS: Dict[int, ColorCode] = {
+         logging.DEBUG: Colors.BLUE,
+         logging.INFO: Colors.GREEN,
+         logging.WARNING: Colors.YELLOW,
+         logging.ERROR: Colors.RED,
+         logging.CRITICAL: ColorCode("\033[31m", Colors.BOLD),
+     }
+
+     # Available highlight colors
+     HIGHLIGHT_COLORS: Dict[str, ColorCode] = {
+         "magenta": ColorCode(Colors.BRIGHT_MAGENTA.fore, Colors.BOLD),
+         "cyan": ColorCode(Colors.BRIGHT_CYAN.fore, Colors.BOLD),
+         "yellow": ColorCode(Colors.YELLOW.fore, Colors.BOLD),
+         "red": ColorCode(Colors.RED.fore, Colors.BOLD),
+     }
+
+     def __init__(self, fmt: Optional[str] = None) -> None:
+         super().__init__(
+             fmt or "%(asctime)s [%(levelname)s] %(namespace)s: %(message)s", datefmt="%H:%M:%S"
+         )
+
+     def format(self, record: logging.LogRecord) -> str:
+         # Add namespace to extra fields if not present
+         if not getattr(record, "namespace", None):
+             record.__dict__["namespace"] = record.name
+
+         # Color the level name
+         record.levelname = Colors.apply(record.levelname, self.COLORS[record.levelno])
+
+         # Color the namespace
+         record.__dict__["namespace"] = Colors.apply(record.__dict__["namespace"], Colors.CYAN)
+
+         # Check if highlight flag is set
+         highlight = getattr(record, "highlight", None)
+         if highlight:
+             # Get the highlight color or default to magenta
+             color_name = highlight if isinstance(highlight, str) else "magenta"
+             highlight_color = self.HIGHLIGHT_COLORS.get(
+                 color_name, self.HIGHLIGHT_COLORS["magenta"]
+             )
+
+             # Apply highlight to the message
+             original_message = record.getMessage()
+             record.msg = Colors.apply(original_message, highlight_color)
+             if record.args:
+                 record.args = ()
+
+         return super().format(record)
+
+
+ class LoggerAdapter(logging.LoggerAdapter[logging.Logger]):
+     """
+     Logger adapter that allows highlighting specific log messages.
+     """
+
+     def process(
+         self, msg: Any, kwargs: MutableMapping[str, Any]
+     ) -> Tuple[Any, MutableMapping[str, Any]]:
+         # Pass highlight flag through to the record
+         return msg, kwargs
+
+     def highlight(
+         self,
+         msg: object,
+         *args: Any,
+         color: Literal["magenta", "cyan", "yellow", "red", "green"] = "magenta",
+         **kwargs: Any,
+     ) -> None:
+         """
+         Log a highlighted message.
+
+         Args:
+             msg: The message format string
+             color: The color to highlight with (magenta, cyan, yellow, red)
+             *args: The args for the message format string
+             **kwargs: Additional logging kwargs
+         """
+         kwargs.setdefault("extra", {})
+         if isinstance(kwargs["extra"], dict):
+             kwargs["extra"]["highlight"] = color
+         return self.info(msg, *args, **kwargs)
+
+
+ def get_logger(namespace: str) -> LoggerAdapter:
+     """
+     Get a colored logger for the specified namespace.
+
+     Args:
+         namespace: The namespace for the logger
+
+     Returns:
+         A configured logger instance with highlighting support
+     """
+     logger = logging.getLogger(namespace)
+
+     # Only add handler if it doesn't exist
+     if not logger.handlers:
+         handler = logging.StreamHandler(sys.stdout)
+         handler.setFormatter(ColoredFormatter())
+         logger.addHandler(handler)
+
+     # Set default level to INFO
+     logger.setLevel(logging.INFO)
+
+     # Wrap with adapter to support highlighting
+     return LoggerAdapter(logger, {})
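
The module above attaches a ColoredFormatter to a stdout StreamHandler and wraps the logger in a LoggerAdapter so individual messages can be emphasized. A usage sketch based only on the code shown; the namespace string and messages are arbitrary examples:

    # Illustrative usage of get_logger and the highlight() helper defined above.
    from docent._log_util import get_logger

    logger = get_logger("docent.example")              # arbitrary namespace, rendered in cyan
    logger.info("starting run %s", "run-001")          # level name colored by severity
    logger.warning("retrying request")                 # WARNING rendered in yellow
    logger.highlight("ingest finished", color="cyan")  # whole message logged at INFO in bold cyan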
@@ -0,0 +1,25 @@
+ from docent.data_models.agent_run import AgentRun
+ from docent.data_models.citation import Citation
+ from docent.data_models.filters import (
+     AgentRunIdFilter,
+     BaseFrameFilter,
+     ComplexFilter,
+     SearchResultPredicateFilter,
+ )
+ from docent.data_models.metadata import BaseAgentRunMetadata, BaseMetadata, FrameDimension
+ from docent.data_models.regex import RegexSnippet
+ from docent.data_models.transcript import Transcript
+
+ __all__ = [
+     "AgentRun",
+     "Citation",
+     "RegexSnippet",
+     "AgentRunIdFilter",
+     "FrameDimension",
+     "BaseFrameFilter",
+     "SearchResultPredicateFilter",
+     "ComplexFilter",
+     "BaseAgentRunMetadata",
+     "BaseMetadata",
+     "Transcript",
+ ]
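
This __init__ defines the public import surface for the data models. Constructor signatures are not shown in this diff, so the sketch below only demonstrates the flat imports made possible by the re-exports above:

    # Only names listed in __all__ above are assumed to exist.
    from docent.data_models import AgentRun, BaseAgentRunMetadata, Citation, Transcript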
@@ -0,0 +1,91 @@
+ import tiktoken
+
+ MAX_TOKENS = 100_000
+
+
+ def get_token_count(text: str, model: str = "gpt-4") -> int:
+     """Get the number of tokens in a text under the GPT-4 tokenization scheme."""
+     encoding = tiktoken.encoding_for_model(model)
+     return len(encoding.encode(text))
+
+
+ def truncate_to_token_limit(text: str, max_tokens: int, model: str = "gpt-4") -> str:
+     """Truncate text to stay within the specified token limit."""
+     encoding = tiktoken.encoding_for_model(model)
+     tokens = encoding.encode(text)
+
+     if len(tokens) <= max_tokens:
+         return text
+
+     return encoding.decode(tokens[:max_tokens])
+
+
+ class MessageRange:
+     """A range of messages in a transcript. start is inclusive, end is exclusive."""
+
+     start: int
+     end: int
+     include_metadata: bool
+     num_tokens: int
+
+     def __init__(self, start: int, end: int, include_metadata: bool, num_tokens: int):
+         self.start = start
+         self.end = end
+         self.include_metadata = include_metadata
+         self.num_tokens = num_tokens
+
+
+ def group_messages_into_ranges(
+     token_counts: list[int], metadata_tokens: int, max_tokens: int, margin: int = 50
+ ) -> list[MessageRange]:
+     """Split a list of messages + metadata into ranges that stay within the specified token limit.
+
+     Always tries to create ranges with metadata included, unless a single message + metadata is too long,
+     in which case you get a lone message with no metadata
+     """
+     ranges: list[MessageRange] = []
+     start_index = 0
+     running_token_count = 0
+
+     i = 0
+     while i < len(token_counts):
+         new_token_count = token_counts[i]
+         if running_token_count + new_token_count + metadata_tokens > max_tokens - margin:
+             if start_index == i:  # a single message + metadata is already too long
+                 ranges.append(
+                     MessageRange(
+                         start=i, end=i + 1, include_metadata=False, num_tokens=new_token_count
+                     )
+                 )
+                 i += 1
+             else:
+                 # add all messages from start_index to i-1, with metadata included
+                 ranges.append(
+                     MessageRange(
+                         start=start_index,
+                         end=i,
+                         include_metadata=True,
+                         num_tokens=running_token_count + metadata_tokens,
+                     )
+                 )
+                 running_token_count = 0
+                 start_index = i
+         else:
+             running_token_count += new_token_count
+             i += 1
+
+     if running_token_count > 0:
+         include_metadata = running_token_count + metadata_tokens < max_tokens - margin
+         num_tokens = (
+             running_token_count + metadata_tokens if include_metadata else running_token_count
+         )
+         ranges.append(
+             MessageRange(
+                 start=start_index,
+                 end=len(token_counts),
+                 include_metadata=include_metadata,
+                 num_tokens=num_tokens,
+             )
+         )
+
+     return ranges
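
The helpers above count tokens with tiktoken's GPT-4 encoding and pack per-message token counts into ranges that fit a budget, optionally reserving room for metadata. A hedged usage sketch: the module's import path is not shown in this hunk, so the functions are assumed to be in scope, and all strings and numbers are invented:

    # Illustrative only: messages and token budgets below are made up.
    messages = ["system prompt", "user question", "assistant answer"]
    counts = [get_token_count(m) for m in messages]

    # Pack messages into ranges that fit a 1,000-token budget, reserving 200 tokens for metadata.
    for r in group_messages_into_ranges(counts, metadata_tokens=200, max_tokens=1_000):
        print(r.start, r.end, r.include_metadata, r.num_tokens)

    # Hard-truncate a long transcript string to at most 1,000 tokens.
    clipped = truncate_to_token_limit("some very long transcript text", max_tokens=1_000)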