hammad-python 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hammad/__init__.py +64 -10
- hammad/based/__init__.py +52 -0
- hammad/based/fields.py +546 -0
- hammad/based/model.py +968 -0
- hammad/based/utils.py +455 -0
- hammad/cache/__init__.py +30 -0
- hammad/{cache.py → cache/_cache.py} +83 -12
- hammad/cli/__init__.py +25 -0
- hammad/cli/plugins/__init__.py +786 -0
- hammad/cli/styles/__init__.py +5 -0
- hammad/cli/styles/animations.py +548 -0
- hammad/cli/styles/settings.py +135 -0
- hammad/cli/styles/types.py +358 -0
- hammad/cli/styles/utils.py +480 -0
- hammad/data/__init__.py +51 -0
- hammad/data/collections/__init__.py +32 -0
- hammad/data/collections/base_collection.py +58 -0
- hammad/data/collections/collection.py +227 -0
- hammad/data/collections/searchable_collection.py +556 -0
- hammad/data/collections/vector_collection.py +497 -0
- hammad/data/databases/__init__.py +21 -0
- hammad/data/databases/database.py +551 -0
- hammad/data/types/__init__.py +33 -0
- hammad/data/types/files/__init__.py +1 -0
- hammad/data/types/files/audio.py +81 -0
- hammad/data/types/files/configuration.py +475 -0
- hammad/data/types/files/document.py +195 -0
- hammad/data/types/files/file.py +358 -0
- hammad/data/types/files/image.py +80 -0
- hammad/json/__init__.py +21 -0
- hammad/{utils/json → json}/converters.py +4 -1
- hammad/logging/__init__.py +27 -0
- hammad/logging/decorators.py +432 -0
- hammad/logging/logger.py +534 -0
- hammad/pydantic/__init__.py +43 -0
- hammad/{utils/pydantic → pydantic}/converters.py +2 -1
- hammad/pydantic/models/__init__.py +28 -0
- hammad/pydantic/models/arbitrary_model.py +46 -0
- hammad/pydantic/models/cacheable_model.py +79 -0
- hammad/pydantic/models/fast_model.py +318 -0
- hammad/pydantic/models/function_model.py +176 -0
- hammad/pydantic/models/subscriptable_model.py +63 -0
- hammad/text/__init__.py +37 -0
- hammad/text/text.py +1068 -0
- hammad/text/utils/__init__.py +1 -0
- hammad/{utils/text → text/utils}/converters.py +2 -2
- hammad/text/utils/markdown/__init__.py +1 -0
- hammad/{utils → text/utils}/markdown/converters.py +3 -3
- hammad/{utils → text/utils}/markdown/formatting.py +1 -1
- hammad/{utils/typing/utils.py → typing/__init__.py} +75 -2
- hammad/web/__init__.py +42 -0
- hammad/web/http/__init__.py +1 -0
- hammad/web/http/client.py +944 -0
- hammad/web/openapi/client.py +740 -0
- hammad/web/search/__init__.py +1 -0
- hammad/web/search/client.py +936 -0
- hammad/web/utils.py +463 -0
- hammad/yaml/__init__.py +30 -0
- hammad/yaml/converters.py +19 -0
- {hammad_python-0.0.10.dist-info → hammad_python-0.0.11.dist-info}/METADATA +14 -8
- hammad_python-0.0.11.dist-info/RECORD +65 -0
- hammad/database.py +0 -447
- hammad/logger.py +0 -273
- hammad/types/color.py +0 -951
- hammad/utils/json/__init__.py +0 -0
- hammad/utils/markdown/__init__.py +0 -0
- hammad/utils/pydantic/__init__.py +0 -0
- hammad/utils/text/__init__.py +0 -0
- hammad/utils/typing/__init__.py +0 -0
- hammad_python-0.0.10.dist-info/RECORD +0 -22
- /hammad/{types/__init__.py → py.typed} +0 -0
- /hammad/{utils → web/openapi}/__init__.py +0 -0
- {hammad_python-0.0.10.dist-info → hammad_python-0.0.11.dist-info}/WHEEL +0 -0
- {hammad_python-0.0.10.dist-info → hammad_python-0.0.11.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,475 @@
|
|
1
|
+
"""hammad.data.types.files.configuration"""
|
2
|
+
|
3
|
+
import os
|
4
|
+
import configparser
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import Any, Self
|
7
|
+
from dotenv import load_dotenv, dotenv_values
|
8
|
+
import httpx
|
9
|
+
import msgspec
|
10
|
+
|
11
|
+
from .file import File, FileSource
|
12
|
+
from ....based.fields import basedfield
|
13
|
+
|
14
|
+
__all__ = ("Configuration",)
|
15
|
+
|
16
|
+
|
17
|
+
class Configuration(File):
    """Model / structure representation for configuration objects
    for both module or application level usage. This class is
    nothing more than a glorified key-value store with a
    few extra features.

    Inherits from File to provide file operations and extends
    with configuration-specific functionality.

    Supported formats are ``json``, ``toml``, ``yaml``, ``ini`` and ``env``.
    """

    # Configuration-specific fields
    config_data: dict[str, Any] = basedfield(default_factory=dict)
    """The actual configuration key-value pairs."""

    format_type: str | None = basedfield(default=None)
    """The format type of the configuration (json, toml, yaml, ini, env)."""

    def __post_init__(self):
        """Initialize configuration data from file data if available."""
        super().__post_init__()

        # If we have data but no config_data, try to parse it
        if self.data is not None and not self.config_data:
            self._parse_data()

    # ------------------------------------------------------------------
    # Format detection helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _format_from_suffix(suffix: str | None) -> str | None:
        """Map a file suffix such as ``".yml"`` to a format name.

        Args:
            suffix: A suffix including the leading dot (case-insensitive).

        Returns:
            The format name, or None when the suffix is empty or unknown.
        """
        suffix_formats = {
            ".json": "json",
            ".toml": "toml",
            ".yaml": "yaml",
            ".yml": "yaml",
            ".ini": "ini",
            ".cfg": "ini",
            ".conf": "ini",
            ".env": "env",
        }
        if not suffix:
            return None
        return suffix_formats.get(suffix.lower())

    @classmethod
    def _format_from_filename(cls, filename: str) -> str | None:
        """Detect the format from a file name or path-like string.

        Args:
            filename: A file name, file path or URL path.

        Returns:
            The format name, or None when it cannot be determined.
        """
        name = Path(str(filename)).name.lower()
        detected = cls._format_from_suffix(Path(name).suffix)
        # pathlib reports an empty suffix for dotfiles, so a bare ".env"
        # (or variants such as ".env.local") must be recognised by name.
        if detected is None and (name == ".env" or name.startswith(".env.")):
            detected = "env"
        return detected

    def _detect_format(self) -> str:
        """Detect the configuration format from extension or content.

        An explicitly set ``format_type`` wins, then the source path (or the
        file extension when there is no path), then the MIME type.

        Returns:
            The detected format name, falling back to ``"json"``.
        """
        if self.format_type:
            return self.format_type

        # Get extension directly from source path to avoid caching issues
        detected = None
        if self.source.path:
            detected = self._format_from_filename(str(self.source.path))
        elif self.extension:
            detected = self._format_from_suffix(self.extension)
        if detected:
            return detected

        # Try to detect from MIME type
        if self.type:
            for candidate in ("json", "yaml", "toml"):
                if candidate in self.type:
                    return candidate

        # Default fallback - try to parse as JSON first
        return "json"

    # ------------------------------------------------------------------
    # Parsing / serialization
    # ------------------------------------------------------------------

    @staticmethod
    def _unquote_env_value(value: str) -> str:
        """Remove surrounding quotes from a dotenv value.

        A value wrapped in one matching pair of quotes loses exactly that
        pair; inside double quotes the escapes ``\\"`` and ``\\\\`` are
        decoded. Any other value keeps the legacy behaviour of stripping
        stray quote characters from both ends.
        """
        import re

        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            inner = value[1:-1]
            if value[0] == '"':
                inner = re.sub(r'\\(["\\])', r"\1", inner)
            return inner
        return value.strip("\"'")

    @classmethod
    def _parse_env(cls, content: str) -> dict[str, str]:
        """Parse dotenv-style ``KEY=value`` text into a dictionary.

        Blank lines, ``#`` comment lines and lines without ``=`` are skipped.
        """
        parsed: dict[str, str] = {}
        for raw_line in content.strip().split("\n"):
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            parsed[key.strip()] = cls._unquote_env_value(value.strip())
        return parsed

    def _parse_data(self) -> None:
        """Parse the file data into configuration format.

        Does nothing when there is no data. On success ``config_data`` holds
        the parsed content and ``format_type`` the format that was used.

        Raises:
            ValueError: If the data cannot be parsed as the detected format.
        """
        if not self.data:
            return

        content = self.data if isinstance(self.data, str) else self.data.decode("utf-8")

        # Determine format from extension or type
        format_type = self._detect_format()

        try:
            if format_type == "json":
                self.config_data = msgspec.json.decode(content.encode("utf-8"))
            elif format_type == "toml":
                self.config_data = msgspec.toml.decode(content.encode("utf-8"))
            elif format_type == "yaml":
                self.config_data = msgspec.yaml.decode(content.encode("utf-8"))
            elif format_type == "ini":
                parser = configparser.ConfigParser()
                parser.read_string(content)
                self.config_data = {
                    section: dict(parser[section]) for section in parser.sections()
                }
            elif format_type == "env":
                self.config_data = self._parse_env(content)

            self.format_type = format_type
        except Exception as e:
            # Chain the original error so the root cause stays visible.
            raise ValueError(
                f"Failed to parse configuration data as {format_type}: {e}"
            ) from e

    def _serialize_data(self, format_type: str | None = None) -> str:
        """Serialize configuration data to string format.

        Args:
            format_type: Format to emit; defaults to the instance's
                ``format_type`` and then to ``"json"``.

        Returns:
            The serialized configuration text.

        Raises:
            ValueError: If the format is not supported.
        """
        format_type = format_type or self.format_type or "json"

        if format_type == "json":
            return msgspec.json.encode(self.config_data).decode("utf-8")
        if format_type == "toml":
            return msgspec.toml.encode(self.config_data).decode("utf-8")
        if format_type == "yaml":
            return msgspec.yaml.encode(self.config_data).decode("utf-8")
        if format_type == "ini":
            import io

            parser = configparser.ConfigParser()
            for section_name, section_data in self.config_data.items():
                parser[section_name] = section_data

            output = io.StringIO()
            parser.write(output)
            return output.getvalue()
        if format_type == "env":
            lines = []
            for key, value in self.config_data.items():
                if isinstance(value, str) and (
                    " " in value or '"' in value or "'" in value
                ):
                    # Escape backslashes and double quotes so the quoted
                    # value can be read back without losing characters.
                    escaped = value.replace("\\", "\\\\").replace('"', '\\"')
                    value = f'"{escaped}"'
                lines.append(f"{key}={value}")
            return "\n".join(lines)

        raise ValueError(f"Unsupported format: {format_type}")

    # ------------------------------------------------------------------
    # Alternate constructors
    # ------------------------------------------------------------------

    @classmethod
    def from_dotenv(cls, path: str | Path | None = None) -> Self:
        """Loads a .env file and creates a configuration object
        from it.

        NOTE: This does not set any environment variables.

        Args:
            path: The path to the .env file to load. If not provided,
                the .env file in the current working directory will be used.

        Raises:
            FileNotFoundError: If the .env file does not exist.
        """
        path = Path.cwd() / ".env" if path is None else Path(path)

        if not path.exists():
            raise FileNotFoundError(f"Environment file not found: {path}")

        # Use dotenv_values to parse without setting environment variables
        config_data = dotenv_values(path)

        return cls(
            config_data=dict(config_data),
            format_type="env",
            source=FileSource(
                is_file=True,
                path=path,
                size=path.stat().st_size if path.exists() else None,
            ),
            type="text/plain",
        )

    @classmethod
    def from_os_prefix(cls, prefix: str) -> Self:
        """Creates a new configuration object using all variables
        that begin with the given prefix.

        The prefix and any underscores directly after it are removed and the
        remaining name is lower-cased to form the configuration key.

        Args:
            prefix: The prefix to use to filter the variables.
        """
        config_data = {
            name.removeprefix(prefix).lstrip("_").lower(): value
            for name, value in os.environ.items()
            if name.startswith(prefix)
        }

        return cls(
            config_data=config_data,
            format_type="env",
            source=FileSource(),
            type="text/plain",
        )

    @classmethod
    def from_os_vars(cls, vars: list[str]) -> Self:
        """Pulls a certain set of environment variables and
        creates a configuration object from them.

        Variables that are not set are silently skipped.

        Args:
            vars: A list of environment variable names to pull.
        """
        config_data = {name: os.environ[name] for name in vars if name in os.environ}

        return cls(
            config_data=config_data,
            format_type="env",
            source=FileSource(),
            type="text/plain",
        )

    @classmethod
    def from_file(
        cls,
        path: str | Path,
    ) -> Self:
        """Parses a file to return a configuration object. This
        utilizes the following file types:

        - json
        - toml
        - yaml
        - ini
        - env

        Args:
            path: The path of the configuration file to load.

        Raises:
            ValueError: If the file content cannot be parsed.
        """
        # Use the parent File class to load the file
        file_obj = File.from_path(path, lazy=False)

        # Create a Configuration object from the File object
        config = cls(
            data=file_obj.data,
            type=file_obj.type,
            source=file_obj.source,
        )

        # __post_init__ parses the data when it runs during construction;
        # only parse here if that did not populate anything.
        if not config.config_data:
            config._parse_data()

        return config

    @classmethod
    def from_url(
        cls,
        url: str,
        *,
        timeout: float = 30.0,
        headers: dict[str, str] | None = None,
    ) -> Self:
        """Load configuration from a URL supporting various formats.

        The format is taken from the file name in the URL path (query string
        and fragment are ignored), then from the ``content-type`` header.

        Args:
            url: The URL to load configuration from
            timeout: Request timeout in seconds
            headers: Optional HTTP headers to include in the request

        Returns:
            A new Configuration instance

        Raises:
            httpx.HTTPStatusError: If the server answers with an error status.
            ValueError: If the response body cannot be parsed.
        """
        from urllib.parse import urlparse

        with httpx.Client(timeout=timeout) as client:
            response = client.get(url, headers=headers or {})
            response.raise_for_status()

            # Get content as text
            content = response.text
            content_type = response.headers.get("content-type")
            encoding = response.encoding

        # Determine format from the URL path first; fall back to the raw URL
        # so URLs that merely end with a known extension keep working.
        format_type = cls._format_from_filename(
            urlparse(url).path
        ) or cls._format_from_filename(url)

        if format_type is None:
            # Try to detect from content-type header
            lowered = (content_type or "").lower()
            for candidate in ("json", "yaml", "toml"):
                if candidate in lowered:
                    format_type = candidate
                    break

        config = cls(
            data=content,
            type=content_type,
            format_type=format_type,
            source=FileSource(
                is_url=True,
                url=url,
                size=len(content.encode("utf-8")),
                encoding=encoding,
            ),
        )

        # Skip the second parse when construction already produced data.
        if not config.config_data:
            config._parse_data()
        return config

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def to_file(
        self,
        path: str | Path,
        *,
        overwrite: bool = False,
        format_type: str | None = None,
    ) -> None:
        """Saves the configuration object to a file. This
        utilizes the following file types:

        - json
        - toml
        - yaml
        - ini
        - env

        Args:
            path: The path to the file to save the configuration to.
            overwrite: Whether to overwrite the file if it already exists.
            format_type: Override the format type for saving.

        Raises:
            FileExistsError: If the file exists and ``overwrite`` is False.
            ValueError: If the resolved format is not supported.
        """
        save_path = Path(path)

        if save_path.exists() and not overwrite:
            raise FileExistsError(f"File already exists: {save_path}")

        # Determine format from path extension if not specified
        if format_type is None:
            format_type = (
                self._format_from_filename(save_path.name)
                or self.format_type
                or "json"
            )

        # Serialize and save
        content = self._serialize_data(format_type)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        save_path.write_text(content, encoding="utf-8")

    def update_file(
        self,
        path: str | Path,
        exclude: list[str] | None = None,
        exclude_none: bool = True,
    ) -> None:
        """Updates a valid configuration file with only the
        differing values.

        The file is loaded, this object's values are merged over the loaded
        ones, and the result is written back in place.

        Args:
            path: The path to the file to update.
            exclude: A list of keys to exclude from the update.
            exclude_none: Whether to exclude keys with None values.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        path = Path(path)

        if not path.exists():
            raise FileNotFoundError(f"Configuration file not found: {path}")

        # Load existing configuration
        existing_config = Configuration.from_file(path)

        # Prepare data to update
        skipped = set(exclude or ())
        update_data = {
            key: value
            for key, value in self.config_data.items()
            if key not in skipped and not (exclude_none and value is None)
        }

        # Merge with existing data
        existing_config.config_data.update(update_data)

        # Save back to file
        existing_config.to_file(path, overwrite=True)

    def to_os(
        self,
        prefix: str | None = None,
        exclude: list[str] | None = None,
    ) -> None:
        """Pushes the configuration object's values as active
        environment variables. This will overwrite any existing
        values for the session.

        Keys are upper-cased; None values are written as empty strings.

        Args:
            prefix: The prefix to use to filter the variables.
            exclude: A list of keys to exclude from the update.
        """
        exclude = exclude or []

        for key, value in self.config_data.items():
            if key in exclude:
                continue

            # Convert value to string
            env_value = str(value) if value is not None else ""

            # Apply prefix if specified
            env_key = f"{prefix}_{key}".upper() if prefix else key.upper()

            # Set environment variable
            os.environ[env_key] = env_value

    # ------------------------------------------------------------------
    # Mapping-style access
    # ------------------------------------------------------------------

    def get(self, key: str, default: Any = None) -> Any:
        """Get a configuration value by key.

        Args:
            key: The configuration key
            default: Default value if key is not found

        Returns:
            The configuration value or default
        """
        return self.config_data.get(key, default)

    def set(self, key: str, value: Any) -> None:
        """Set a configuration value.

        Args:
            key: The configuration key
            value: The value to set
        """
        self.config_data[key] = value

    def __getitem__(self, key: str) -> Any:
        """Get configuration value using dict-like access."""
        return self.config_data[key]

    def __setitem__(self, key: str, value: Any) -> None:
        """Set configuration value using dict-like access."""
        self.config_data[key] = value

    def __contains__(self, key: str) -> bool:
        """Check if configuration contains a key."""
        return key in self.config_data

    def keys(self):
        """Return configuration keys."""
        return self.config_data.keys()

    def values(self):
        """Return configuration values."""
        return self.config_data.values()

    def items(self):
        """Return configuration key-value pairs."""
        return self.config_data.items()
|
@@ -0,0 +1,195 @@
|
|
1
|
+
"""hammad.data.types.files.document"""
|
2
|
+
|
3
|
+
import httpx
|
4
|
+
from typing import Any, Self, Iterator
|
5
|
+
from markdown_it import MarkdownIt
|
6
|
+
|
7
|
+
from .file import File, FileSource
|
8
|
+
from ....based.fields import basedfield
|
9
|
+
|
10
|
+
__all__ = ("Document",)
|
11
|
+
|
12
|
+
|
13
|
+
class Document(File):
    """A representation of a document, that is loadable from both a URL, file path
    or bytes. This document can additionally be used to represent web pages, as well
    as implement markdown formatting for both documents and web pages."""

    # Cached properties for text processing
    _lines: list[str] | None = basedfield(default=None)
    _content: str | None = basedfield(default=None)
    _md_parser: MarkdownIt | None = basedfield(default=None)
    metadata: dict[str, Any] = basedfield(default_factory=dict)

    @property
    def content(self) -> str:
        """Get the document content as string.

        The result of ``read()`` is decoded with the source encoding
        (UTF-8 when none is set) if it is not already text, then cached.
        """
        if self._content is None:
            data = self.read()
            self._content = (
                data
                if isinstance(data, str)
                else data.decode(self.source.encoding or "utf-8")
            )
        return self._content

    @property
    def lines(self) -> list[str]:
        """Get lines of the document (cached for efficiency)."""
        if self._lines is None:
            self._lines = self.content.splitlines(keepends=False)
        return self._lines

    @property
    def line_count(self) -> int:
        """Get the number of lines in the document."""
        return len(self.lines)

    @property
    def word_count(self) -> int:
        """Get the word count of the document."""
        return len(self.content.split())

    @property
    def char_count(self) -> int:
        """Get the character count of the document."""
        return len(self.content)

    @property
    def is_markdown(self) -> bool:
        """Check if the document is a markdown file.

        The extension comparison is case-insensitive, so ``.MD`` counts.
        """
        markdown_extensions = {".md", ".markdown", ".mdown", ".mkd", ".mdx"}
        return (self.extension or "").lower() in markdown_extensions

    @property
    def md_parser(self) -> MarkdownIt:
        """Get the markdown parser (lazy initialization)."""
        if self._md_parser is None:
            self._md_parser = MarkdownIt()
        return self._md_parser

    def iter_lines(self, *, strip: bool = False) -> Iterator[str]:
        """Iterate over lines in the document.

        Args:
            strip: If True, strip whitespace from each line.

        Yields:
            Lines from the document.
        """
        for line in self.lines:
            yield line.strip() if strip else line

    def iter_paragraphs(self) -> Iterator[str]:
        """Iterate over paragraphs (text blocks separated by empty lines).

        Yields:
            Each paragraph with its lines joined by newlines.
        """
        paragraph: list[str] = []
        for line in self.lines:
            if line.strip():
                paragraph.append(line)
            elif paragraph:
                yield "\n".join(paragraph)
                paragraph = []
        # Flush the trailing paragraph when the text does not end blank.
        if paragraph:
            yield "\n".join(paragraph)

    def search(
        self, pattern: str, *, case_sensitive: bool = False
    ) -> list[tuple[int, str]]:
        """Search for a pattern in the document.

        This is a plain substring match, not a regular expression.

        Args:
            pattern: The pattern to search for.
            case_sensitive: If True, search is case-sensitive.

        Returns:
            List of tuples (line_number, line_content) for matching lines.
        """
        needle = pattern if case_sensitive else pattern.lower()

        return [
            (number, line)  # 1-indexed line numbers
            for number, line in enumerate(self.lines, start=1)
            if needle in (line if case_sensitive else line.lower())
        ]

    def render_markdown(self) -> str:
        """Render markdown content to HTML.

        Non-markdown documents are returned unchanged.
        """
        if not self.is_markdown:
            return self.content
        return self.md_parser.render(self.content)

    def extract_headers(self) -> list[tuple[int, str]]:
        """Extract headers from markdown documents.

        Markdown files are tokenized with the markdown parser. For any other
        file, lines whose first non-blank character is ``#`` are treated as
        headers, with the number of leading ``#`` characters as the level.

        Returns:
            List of tuples (level, text) for each header.
        """
        headers: list[tuple[int, str]] = []

        if self.is_markdown:
            tokens = self.md_parser.parse(self.content)
            # A heading is an opening token followed by its inline content.
            for token, following in zip(tokens, tokens[1:]):
                if token.type == "heading_open" and following.type == "inline":
                    level = int(token.tag[1])  # h1 -> 1, h2 -> 2, etc.
                    headers.append((level, following.content))
            return headers

        # For non-markdown files, look for common header patterns
        for line in self.lines:
            stripped = line.strip()
            if not stripped.startswith("#"):
                continue
            # Measure on the stripped line so indented headers get the
            # right level and do not keep their '#' markers in the text.
            remainder = stripped.lstrip("#")
            headers.append((len(stripped) - len(remainder), remainder.strip()))
        return headers

    @classmethod
    def from_url(
        cls,
        url: str,
        *,
        lazy: bool = True,
        timeout: float = 30.0,
    ) -> Self:
        """Download and create a document from a URL.

        With ``lazy=True`` no request is made here and the document is
        created with no data, size, encoding or type.

        Args:
            url: The URL to download from.
            lazy: If True, defer loading content until needed.
            timeout: Request timeout in seconds.

        Returns:
            A new Document instance.

        Raises:
            httpx.HTTPStatusError: If ``lazy`` is False and the server
                answers with an error status.
        """
        data = None
        size = None
        encoding = None
        mime_type = None

        if not lazy:
            with httpx.Client(timeout=timeout) as client:
                response = client.get(url)
                response.raise_for_status()

                # Always get text for documents
                data = response.text
                size = len(data.encode("utf-8"))
                encoding = response.encoding

                # Get content type, dropping parameters such as the charset
                content_type = response.headers.get("content-type", "")
                mime_type = content_type.split(";")[0] if content_type else "text/plain"

        return cls(
            data=data,
            type=mime_type,
            source=FileSource(
                is_url=True,
                url=url,
                size=size,
                encoding=encoding,
            ),
        )
|