hammad-python 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. hammad/__init__.py +64 -10
  2. hammad/based/__init__.py +52 -0
  3. hammad/based/fields.py +546 -0
  4. hammad/based/model.py +968 -0
  5. hammad/based/utils.py +455 -0
  6. hammad/cache/__init__.py +30 -0
  7. hammad/{cache.py → cache/_cache.py} +83 -12
  8. hammad/cli/__init__.py +25 -0
  9. hammad/cli/plugins/__init__.py +786 -0
  10. hammad/cli/styles/__init__.py +5 -0
  11. hammad/cli/styles/animations.py +548 -0
  12. hammad/cli/styles/settings.py +135 -0
  13. hammad/cli/styles/types.py +358 -0
  14. hammad/cli/styles/utils.py +480 -0
  15. hammad/data/__init__.py +51 -0
  16. hammad/data/collections/__init__.py +32 -0
  17. hammad/data/collections/base_collection.py +58 -0
  18. hammad/data/collections/collection.py +227 -0
  19. hammad/data/collections/searchable_collection.py +556 -0
  20. hammad/data/collections/vector_collection.py +497 -0
  21. hammad/data/databases/__init__.py +21 -0
  22. hammad/data/databases/database.py +551 -0
  23. hammad/data/types/__init__.py +33 -0
  24. hammad/data/types/files/__init__.py +1 -0
  25. hammad/data/types/files/audio.py +81 -0
  26. hammad/data/types/files/configuration.py +475 -0
  27. hammad/data/types/files/document.py +195 -0
  28. hammad/data/types/files/file.py +358 -0
  29. hammad/data/types/files/image.py +80 -0
  30. hammad/json/__init__.py +21 -0
  31. hammad/{utils/json → json}/converters.py +4 -1
  32. hammad/logging/__init__.py +27 -0
  33. hammad/logging/decorators.py +432 -0
  34. hammad/logging/logger.py +534 -0
  35. hammad/pydantic/__init__.py +43 -0
  36. hammad/{utils/pydantic → pydantic}/converters.py +2 -1
  37. hammad/pydantic/models/__init__.py +28 -0
  38. hammad/pydantic/models/arbitrary_model.py +46 -0
  39. hammad/pydantic/models/cacheable_model.py +79 -0
  40. hammad/pydantic/models/fast_model.py +318 -0
  41. hammad/pydantic/models/function_model.py +176 -0
  42. hammad/pydantic/models/subscriptable_model.py +63 -0
  43. hammad/text/__init__.py +37 -0
  44. hammad/text/text.py +1068 -0
  45. hammad/text/utils/__init__.py +1 -0
  46. hammad/{utils/text → text/utils}/converters.py +2 -2
  47. hammad/text/utils/markdown/__init__.py +1 -0
  48. hammad/{utils → text/utils}/markdown/converters.py +3 -3
  49. hammad/{utils → text/utils}/markdown/formatting.py +1 -1
  50. hammad/{utils/typing/utils.py → typing/__init__.py} +75 -2
  51. hammad/web/__init__.py +42 -0
  52. hammad/web/http/__init__.py +1 -0
  53. hammad/web/http/client.py +944 -0
  54. hammad/web/openapi/client.py +740 -0
  55. hammad/web/search/__init__.py +1 -0
  56. hammad/web/search/client.py +936 -0
  57. hammad/web/utils.py +463 -0
  58. hammad/yaml/__init__.py +30 -0
  59. hammad/yaml/converters.py +19 -0
  60. {hammad_python-0.0.10.dist-info → hammad_python-0.0.11.dist-info}/METADATA +14 -8
  61. hammad_python-0.0.11.dist-info/RECORD +65 -0
  62. hammad/database.py +0 -447
  63. hammad/logger.py +0 -273
  64. hammad/types/color.py +0 -951
  65. hammad/utils/json/__init__.py +0 -0
  66. hammad/utils/markdown/__init__.py +0 -0
  67. hammad/utils/pydantic/__init__.py +0 -0
  68. hammad/utils/text/__init__.py +0 -0
  69. hammad/utils/typing/__init__.py +0 -0
  70. hammad_python-0.0.10.dist-info/RECORD +0 -22
  71. /hammad/{types/__init__.py → py.typed} +0 -0
  72. /hammad/{utils → web/openapi}/__init__.py +0 -0
  73. {hammad_python-0.0.10.dist-info → hammad_python-0.0.11.dist-info}/WHEEL +0 -0
  74. {hammad_python-0.0.10.dist-info → hammad_python-0.0.11.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,475 @@
1
+ """hammad.data.types.files.configuration"""
2
+
3
+ import os
4
+ import configparser
5
+ from pathlib import Path
6
+ from typing import Any, Self
7
+ from dotenv import load_dotenv, dotenv_values
8
+ import httpx
9
+ import msgspec
10
+
11
+ from .file import File, FileSource
12
+ from ....based.fields import basedfield
13
+
14
+ __all__ = ("Configuration",)
15
+
16
+
17
# Maps lowercase file suffixes to the configuration format they denote.
_FORMATS_BY_SUFFIX: dict[str, str] = {
    ".json": "json",
    ".toml": "toml",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".ini": "ini",
    ".cfg": "ini",
    ".conf": "ini",
    ".env": "env",
}


def _format_from_name(name: str | None) -> str | None:
    """Return the configuration format implied by a file name or suffix.

    Accepts a bare suffix (".yaml"), a file name ("settings.toml") or a
    slash-separated path ("/conf/app.ini"). Matching is case-insensitive.

    The lookup is done on the text after the last dot of the final path
    component rather than on ``Path.suffix``, because ``Path(".env").suffix``
    is empty and a plain ".env" file would otherwise never be recognised.

    Returns:
        One of "json", "toml", "yaml", "ini", "env", or None when the name
        gives no hint.
    """
    if not name:
        return None

    base = name.rsplit("/", 1)[-1].lower()
    dot = base.rfind(".")
    if dot != -1:
        detected = _FORMATS_BY_SUFFIX.get(base[dot:])
        if detected:
            return detected

    # Variants such as ".env.local" or ".env.production".
    if base.startswith(".env."):
        return "env"
    return None


def _format_from_mime(mime: str | None) -> str | None:
    """Return the configuration format named in a MIME / content-type string."""
    if not mime:
        return None
    lowered = mime.lower()
    for candidate in ("json", "yaml", "toml"):
        if candidate in lowered:
            return candidate
    return None


def _quote_env_value(value: Any) -> Any:
    """Quote a value for a dotenv line when it needs quoting.

    Strings containing a space or a quote character are wrapped in double
    quotes, with embedded backslashes and double quotes escaped so the line
    stays parseable. Every other value is returned unchanged.
    """
    if isinstance(value, str) and any(ch in value for ch in " \"'"):
        escaped = value.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'
    return value


def _unquote_env_value(raw: str) -> str:
    """Remove one pair of matching surrounding quotes from a dotenv value.

    Only a single matching pair is removed, so quote characters that belong
    to the value itself are preserved. Inside double quotes, the escapes
    produced by ``_quote_env_value`` (``\\"`` and ``\\\\``) are undone.
    """
    import re

    if len(raw) >= 2 and raw[0] == raw[-1] and raw[0] in "\"'":
        inner = raw[1:-1]
        if raw[0] == '"':
            inner = re.sub(r'\\(["\\])', r"\1", inner)
        return inner
    return raw


def _parse_env_text(text: str) -> dict[str, str]:
    """Parse dotenv-style ``KEY=VALUE`` lines into a dictionary.

    Blank lines, lines starting with "#", and lines without "=" are skipped.
    """
    parsed: dict[str, str] = {}
    for raw_line in text.strip().split("\n"):
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        parsed[key.strip()] = _unquote_env_value(value.strip())
    return parsed


class Configuration(File):
    """Model / structure representation for configuration objects
    for both module or application level usage. This class is
    nothing more than a glorified key-value store with a
    few extra features.

    Inherits from File to provide file operations and extends
    with configuration-specific functionality."""

    # Configuration-specific fields
    config_data: dict[str, Any] = basedfield(default_factory=dict)
    """The actual configuration key-value pairs."""

    format_type: str | None = basedfield(default=None)
    """The format type of the configuration (json, toml, yaml, ini, env)."""

    def __post_init__(self):
        """Initialize configuration data from file data if available."""
        super().__post_init__()

        # If we have data but no config_data, try to parse it
        if self.data is not None and not self.config_data:
            self._parse_data()

    def _parse_data(self) -> None:
        """Parse ``self.data`` into ``self.config_data``.

        The format comes from ``_detect_format``. On success ``format_type``
        is set to the detected format. Does nothing when ``self.data`` is
        empty.

        Raises:
            ValueError: If the data cannot be parsed in the detected format.
                The underlying parser error is chained as ``__cause__``.
        """
        if not self.data:
            return

        content = self.data if isinstance(self.data, str) else self.data.decode("utf-8")

        # Determine format from extension or type
        format_type = self._detect_format()

        try:
            if format_type == "json":
                self.config_data = msgspec.json.decode(content.encode("utf-8"))
            elif format_type == "toml":
                self.config_data = msgspec.toml.decode(content.encode("utf-8"))
            elif format_type == "yaml":
                self.config_data = msgspec.yaml.decode(content.encode("utf-8"))
            elif format_type == "ini":
                parser = configparser.ConfigParser()
                parser.read_string(content)
                self.config_data = {
                    section: dict(parser[section]) for section in parser.sections()
                }
            elif format_type == "env":
                self.config_data = _parse_env_text(content)

            self.format_type = format_type
        except Exception as e:
            # Chain the cause so the parser's own traceback is not lost.
            raise ValueError(
                f"Failed to parse configuration data as {format_type}: {e}"
            ) from e

    def _detect_format(self) -> str:
        """Detect the configuration format from extension or content type.

        Order of precedence: an explicit ``format_type``, the source path's
        file name (or, when there is no path, ``self.extension``), the MIME
        type in ``self.type``, and finally "json" as the default.
        """
        if self.format_type:
            return self.format_type

        # Read the name straight from the source path to avoid caching issues
        detected = None
        if self.source.path:
            detected = _format_from_name(Path(self.source.path).name)
        elif self.extension:
            detected = _format_from_name(self.extension)
        if detected:
            return detected

        # Try to detect from MIME type
        detected = _format_from_mime(self.type)
        if detected:
            return detected

        # Default fallback - try to parse as JSON first
        return "json"

    def _serialize_data(self, format_type: str | None = None) -> str:
        """Serialize configuration data to string format.

        Args:
            format_type: Format to serialize as. Defaults to
                ``self.format_type``, then "json".

        Returns:
            The serialized configuration text.

        Raises:
            ValueError: If ``format_type`` is not a supported format.
        """
        format_type = format_type or self.format_type or "json"

        if format_type == "json":
            return msgspec.json.encode(self.config_data).decode("utf-8")
        if format_type == "toml":
            return msgspec.toml.encode(self.config_data).decode("utf-8")
        if format_type == "yaml":
            return msgspec.yaml.encode(self.config_data).decode("utf-8")
        if format_type == "ini":
            import io

            parser = configparser.ConfigParser()
            for section_name, section_data in self.config_data.items():
                # configparser only accepts string option values, so values
                # loaded from typed formats (ints, bools, ...) are stringified.
                parser[section_name] = {
                    str(option): str(option_value)
                    for option, option_value in section_data.items()
                }

            output = io.StringIO()
            parser.write(output)
            return output.getvalue()
        if format_type == "env":
            return "\n".join(
                f"{key}={_quote_env_value(value)}"
                for key, value in self.config_data.items()
            )
        raise ValueError(f"Unsupported format: {format_type}")

    @classmethod
    def from_dotenv(cls, path: str | Path | None = None) -> Self:
        """Loads a .env file and creates a configuration object
        from it.

        NOTE: This does not set any environment variables.

        Args:
            path: The path to the .env file to load. If not provided,
                the .env file in the current working directory will be used.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        path = Path.cwd() / ".env" if path is None else Path(path)

        if not path.exists():
            raise FileNotFoundError(f"Environment file not found: {path}")

        # Use dotenv_values to parse without setting environment variables
        config_data = dotenv_values(path)

        return cls(
            config_data=dict(config_data),
            format_type="env",
            source=FileSource(
                is_file=True,
                path=path,
                size=path.stat().st_size,
            ),
            type="text/plain",
        )

    @classmethod
    def from_os_prefix(cls, prefix: str) -> Self:
        """Creates a new configuration object using all variables
        that begin with the given prefix.

        The prefix and any underscores directly after it are removed from
        each key, and the remainder is lowercased.

        Args:
            prefix: The prefix to use to filter the variables.
        """
        config_data = {
            name[len(prefix) :].lstrip("_").lower(): value
            for name, value in os.environ.items()
            if name.startswith(prefix)
        }

        return cls(
            config_data=config_data,
            format_type="env",
            source=FileSource(),
            type="text/plain",
        )

    @classmethod
    def from_os_vars(cls, vars: list[str]) -> Self:
        """Pulls a certain set of environment variables and
        creates a configuration object from them.

        Variables that are not set in the environment are skipped.

        Args:
            vars: A list of environment variable names to pull.
        """
        config_data = {name: os.environ[name] for name in vars if name in os.environ}

        return cls(
            config_data=config_data,
            format_type="env",
            source=FileSource(),
            type="text/plain",
        )

    @classmethod
    def from_file(
        cls,
        path: str | Path,
    ) -> Self:
        """Parses a file to return a configuration object. This
        utilizes the following file types:

        - json
        - toml
        - yaml
        - ini
        - env

        Args:
            path: The path of the configuration file to load.

        Raises:
            ValueError: If the file content cannot be parsed.
        """
        # Use the parent File class to load the file
        file_obj = File.from_path(path, lazy=False)

        # Create a Configuration object from the File object
        config = cls(
            data=file_obj.data,
            type=file_obj.type,
            source=file_obj.source,
        )

        # __post_init__ may already have parsed the data; only parse when
        # nothing has been loaded yet so the content is not parsed twice.
        if not config.config_data:
            config._parse_data()

        return config

    @classmethod
    def from_url(
        cls,
        url: str,
        *,
        timeout: float = 30.0,
        headers: dict[str, str] | None = None,
    ) -> Self:
        """Load configuration from a URL supporting various formats.

        The format is taken from the extension of the URL *path* (query
        string and fragment are ignored, matching is case-insensitive) and
        otherwise from the response's content-type header.

        Args:
            url: The URL to load configuration from
            timeout: Request timeout in seconds
            headers: Optional HTTP headers to include in the request

        Returns:
            A new Configuration instance

        Raises:
            httpx.HTTPStatusError: If the server returns an error status.
            ValueError: If the response body cannot be parsed.
        """
        from urllib.parse import urlsplit

        with httpx.Client(timeout=timeout) as client:
            response = client.get(url, headers=headers or {})
            response.raise_for_status()

            # Get content as text
            content = response.text
            content_type = response.headers.get("content-type")
            encoding = response.encoding

        # Determine format from URL extension or content-type
        format_type = _format_from_name(urlsplit(url).path) or _format_from_mime(
            content_type
        )

        config = cls(
            data=content,
            type=content_type,
            format_type=format_type,
            source=FileSource(
                is_url=True,
                url=url,
                size=len(content.encode("utf-8")),
                encoding=encoding,
            ),
        )

        # __post_init__ may already have parsed the data; avoid a second pass.
        if not config.config_data:
            config._parse_data()
        return config

    def to_file(
        self,
        path: str | Path,
        *,
        overwrite: bool = False,
        format_type: str | None = None,
    ) -> None:
        """Saves the configuration object to a file. This
        utilizes the following file types:

        - json
        - toml
        - yaml
        - ini
        - env

        Args:
            path: The path to the file to save the configuration to.
            overwrite: Whether to overwrite the file if it already exists.
            format_type: Override the format type for saving. When omitted,
                the format is taken from the file name, then from
                ``self.format_type``, then defaults to "json".

        Raises:
            FileExistsError: If the file exists and ``overwrite`` is False.
            ValueError: If the resolved format is not supported.
        """
        save_path = Path(path)

        if save_path.exists() and not overwrite:
            raise FileExistsError(f"File already exists: {save_path}")

        # Determine format from the file name if not specified
        if format_type is None:
            format_type = (
                _format_from_name(save_path.name) or self.format_type or "json"
            )

        # Serialize and save
        content = self._serialize_data(format_type)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        save_path.write_text(content, encoding="utf-8")

    def update_file(
        self,
        path: str | Path,
        exclude: list[str] | None = None,
        exclude_none: bool = True,
    ) -> None:
        """Updates a valid configuration file with this object's values.

        The existing file is loaded, this object's top-level keys are merged
        over it, and the result is written back to the same path.

        Args:
            path: The path to the file to update.
            exclude: A list of keys to exclude from the update.
            exclude_none: Whether to exclude keys with None values.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        path = Path(path)

        if not path.exists():
            raise FileNotFoundError(f"Configuration file not found: {path}")

        # Load existing configuration
        existing_config = Configuration.from_file(path)

        # Prepare data to update
        skipped = set(exclude or ())
        update_data = {
            key: value
            for key, value in self.config_data.items()
            if key not in skipped and not (exclude_none and value is None)
        }

        # Merge with existing data
        existing_config.config_data.update(update_data)

        # Save back to file
        existing_config.to_file(path, overwrite=True)

    def to_os(
        self,
        prefix: str | None = None,
        exclude: list[str] | None = None,
    ) -> None:
        """Pushes the configuration object's values as active
        environment variables. This will overwrite any existing
        values for the session.

        Variable names are upper-cased; values are converted with ``str``
        (None becomes an empty string).

        Args:
            prefix: Optional prefix prepended to every variable name,
                joined with an underscore.
            exclude: A list of keys to exclude from the update.
        """
        exclude = exclude or []

        for key, value in self.config_data.items():
            if key in exclude:
                continue

            # Convert value to string
            env_value = str(value) if value is not None else ""

            # Apply prefix if specified
            env_key = f"{prefix}_{key}".upper() if prefix else key.upper()

            # Set environment variable
            os.environ[env_key] = env_value

    def get(self, key: str, default: Any = None) -> Any:
        """Get a configuration value by key.

        Args:
            key: The configuration key
            default: Default value if key is not found

        Returns:
            The configuration value or default
        """
        return self.config_data.get(key, default)

    def set(self, key: str, value: Any) -> None:
        """Set a configuration value.

        Args:
            key: The configuration key
            value: The value to set
        """
        self.config_data[key] = value

    def __getitem__(self, key: str) -> Any:
        """Get configuration value using dict-like access."""
        return self.config_data[key]

    def __setitem__(self, key: str, value: Any) -> None:
        """Set configuration value using dict-like access."""
        self.config_data[key] = value

    def __contains__(self, key: str) -> bool:
        """Check if configuration contains a key."""
        return key in self.config_data

    def keys(self):
        """Return configuration keys."""
        return self.config_data.keys()

    def values(self):
        """Return configuration values."""
        return self.config_data.values()

    def items(self):
        """Return configuration key-value pairs."""
        return self.config_data.items()
@@ -0,0 +1,195 @@
1
+ """hammad.data.types.files.document"""
2
+
3
+ import httpx
4
+ from typing import Any, Self, Iterator
5
+ from markdown_it import MarkdownIt
6
+
7
+ from .file import File, FileSource
8
+ from ....based.fields import basedfield
9
+
10
+ __all__ = ("Document",)
11
+
12
+
13
class Document(File):
    """A representation of a document, that is loadable from both a URL, file path
    or bytes. This document can additionally be used to represent web pages, as well
    as implement markdown formatting for both documents and web pages."""

    # Cached properties for text processing
    _lines: list[str] | None = basedfield(default=None)
    _content: str | None = basedfield(default=None)
    _md_parser: MarkdownIt | None = basedfield(default=None)
    metadata: dict[str, Any] = basedfield(default_factory=dict)

    @property
    def content(self) -> str:
        """Get the document content as string.

        The result of ``read()`` is decoded with the source encoding
        (UTF-8 when none is recorded) and cached after the first access.
        """
        if self._content is None:
            data = self.read()
            self._content = (
                data
                if isinstance(data, str)
                else data.decode(self.source.encoding or "utf-8")
            )
        return self._content

    @property
    def lines(self) -> list[str]:
        """Get lines of the document (cached for efficiency)."""
        if self._lines is None:
            self._lines = self.content.splitlines(keepends=False)
        return self._lines

    @property
    def line_count(self) -> int:
        """Get the number of lines in the document."""
        return len(self.lines)

    @property
    def word_count(self) -> int:
        """Get the word count of the document."""
        return len(self.content.split())

    @property
    def char_count(self) -> int:
        """Get the character count of the document."""
        return len(self.content)

    @property
    def is_markdown(self) -> bool:
        """Check if the document is a markdown file.

        The extension is compared case-insensitively so that names such as
        "README.MD" are recognised.
        """
        extension = self.extension
        if not extension:
            return False
        return extension.lower() in {".md", ".markdown", ".mdown", ".mkd", ".mdx"}

    @property
    def md_parser(self) -> MarkdownIt:
        """Get the markdown parser (lazy initialization)."""
        if self._md_parser is None:
            self._md_parser = MarkdownIt()
        return self._md_parser

    def iter_lines(self, *, strip: bool = False) -> Iterator[str]:
        """Iterate over lines in the document.

        Args:
            strip: If True, strip whitespace from each line.

        Yields:
            Lines from the document.
        """
        for line in self.lines:
            yield line.strip() if strip else line

    def iter_paragraphs(self) -> Iterator[str]:
        """Iterate over paragraphs (text blocks separated by empty lines)."""
        paragraph: list[str] = []
        for line in self.lines:
            if line.strip():
                paragraph.append(line)
            elif paragraph:
                yield "\n".join(paragraph)
                paragraph = []
        # Flush the final paragraph when the document does not end on a blank line.
        if paragraph:
            yield "\n".join(paragraph)

    def search(
        self, pattern: str, *, case_sensitive: bool = False
    ) -> list[tuple[int, str]]:
        """Search for a pattern in the document.

        This is a plain substring match, not a regular expression search.

        Args:
            pattern: The pattern to search for.
            case_sensitive: If True, search is case-sensitive.

        Returns:
            List of tuples (line_number, line_content) for matching lines.
        """
        needle = pattern if case_sensitive else pattern.lower()

        # 1-indexed line numbers
        return [
            (number, line)
            for number, line in enumerate(self.lines, start=1)
            if needle in (line if case_sensitive else line.lower())
        ]

    def render_markdown(self) -> str:
        """Render markdown content to HTML.

        Non-markdown documents are returned unchanged.
        """
        if not self.is_markdown:
            return self.content
        return self.md_parser.render(self.content)

    def extract_headers(self) -> list[tuple[int, str]]:
        """Extract headers from the document.

        Markdown documents are tokenized with the markdown parser. Any other
        document is scanned for lines whose first non-blank character is "#".

        Returns:
            List of tuples (level, text) for each header.
        """
        headers: list[tuple[int, str]] = []
        if self.is_markdown:
            tokens = self.md_parser.parse(self.content)
            for index, token in enumerate(tokens):
                if token.type != "heading_open":
                    continue
                level = int(token.tag[1])  # h1 -> 1, h2 -> 2, etc.
                # Next token should be inline with the content
                if index + 1 < len(tokens) and tokens[index + 1].type == "inline":
                    headers.append((level, tokens[index + 1].content))
        else:
            # For non-markdown files, look for common header patterns
            for line in self.lines:
                stripped = line.strip()
                if not stripped.startswith("#"):
                    continue
                # Count the markers on the stripped line so that leading
                # indentation does not produce level 0 with "#" in the text.
                level = len(stripped) - len(stripped.lstrip("#"))
                headers.append((level, stripped[level:].strip()))
        return headers

    @classmethod
    def from_url(
        cls,
        url: str,
        *,
        lazy: bool = True,
        timeout: float = 30.0,
    ) -> Self:
        """Download and create a document from a URL.

        Args:
            url: The URL to download from.
            lazy: If True, defer loading content until needed. No request is
                made here in that case, and data, type, size and encoding
                are left as None.
            timeout: Request timeout in seconds.

        Returns:
            A new Document instance.

        Raises:
            httpx.HTTPStatusError: If ``lazy`` is False and the server
                returns an error status.
        """
        data = None
        size = None
        encoding = None
        mime_type = None

        if not lazy:
            with httpx.Client(timeout=timeout) as client:
                response = client.get(url)
                response.raise_for_status()

                # Always get text for documents
                data = response.text
                size = len(data.encode("utf-8"))
                encoding = response.encoding

                # Keep only the media type, dropping parameters such as charset
                content_type = response.headers.get("content-type", "")
                mime_type = content_type.split(";")[0] if content_type else "text/plain"

        return cls(
            data=data,
            type=mime_type,
            source=FileSource(
                is_url=True,
                url=url,
                size=size,
                encoding=encoding,
            ),
        )