thunderdots 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
+ from .client import ThunderDots
6
+
7
# Resolve the public version string from the installed distribution metadata.
try:
    __version__ = version("thunderdots")
except PackageNotFoundError:
    # No installed metadata for "thunderdots" was found: use a placeholder.
    __version__ = "0.0.0"

# Names exposed by ``from thunderdots import *``.
__all__ = ["ThunderDots", "__version__"]
@@ -0,0 +1,24 @@
1
+ # file generated by vcs-versioning
2
+ # don't change, don't track in version control
3
+ from __future__ import annotations
4
+
5
# Names exported by this generated module; each value is bound both to a
# dunder name and to a plain alias.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# Type declarations for the attributes assigned below.
version: str
__version__: str
__version_tuple__: tuple[int | str, ...]
version_tuple: tuple[int | str, ...]
commit_id: str | None
__commit_id__: str | None

# Version values written by the generator.
__version__ = version = '0.1.0'
__version_tuple__ = version_tuple = (0, 1, 0)

# No commit id is recorded in this generated file.
__commit_id__ = commit_id = None
thunderdots/client.py ADDED
@@ -0,0 +1,559 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """client.py
4
+
5
+ ThunderDots client interface (single Python pipeline, optional Go fetcher).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
import asyncio
import concurrent.futures
import csv
import json
import threading
from collections.abc import Callable
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path
from typing import Any

from .ui import UI
from .stats import Stats
from .config import ThunderDotsConfig, CollectionParams, ResourceParams
from .fetcher import HttpxFetcher, Fetcher
from .extract.walker import walk_collections
from .extract.resources import fetch_resources
from .normalize.output import build_output
from .normalize.metadata import canonicalize_metadata_keys
from .validation import validate_notice, validate_many
from .orm import DotsNotice
29
+
30
+
31
def _package_version() -> str:
    """Look up the version of the installed ``thunderdots`` distribution.

    :return: The version reported by the package metadata, or ``"0.0.0"``
        when no metadata for ``thunderdots`` can be found.
    :rtype: str
    """
    try:
        installed = version("thunderdots")
    except PackageNotFoundError:
        # Metadata lookup failed: report the placeholder version instead.
        installed = "0.0.0"
    return installed
41
+
42
+
43
def _run_coro_in_thread(coro_factory: Callable[[], Any]) -> Any:
    """Run a coroutine to completion on a helper thread and return its result.

    The coroutine is executed with :func:`asyncio.run` inside a separate
    thread, so this function can be called from synchronous code even when
    the calling thread already has a running event loop (e.g. a notebook).
    The caller blocks until the helper thread has finished.

    :param coro_factory: Zero-argument callable returning the coroutine to
        run (e.g. a bound async method). It is invoked on the helper thread.
    :type coro_factory: Callable[[], Any]
    :return: The value produced by the coroutine.
    :rtype: Any
    :raises BaseException: Whatever the factory or the coroutine raised on
        the helper thread is re-raised in the calling thread.
    """
    outcome: dict[str, Any] = {}
    failure: dict[str, BaseException] = {}

    def runner() -> None:
        """Execute the coroutine and record either its value or its error."""
        try:
            outcome["value"] = asyncio.run(coro_factory())
        except BaseException as exc:
            # Captured on purpose (including KeyboardInterrupt/SystemExit) so
            # the calling thread can re-raise it after join().
            failure["value"] = exc

    thread = threading.Thread(target=runner, daemon=True)
    thread.start()
    thread.join()

    if "value" in failure:
        raise failure["value"]

    return outcome.get("value")
69
+
70
+
71
def _flatten_for_csv(data: dict[str, Any], prefix: str = "") -> dict[str, str]:
    """Collapse a nested dictionary into a single level of CSV-ready cells.

    Nested dictionaries contribute their entries under dot-joined keys, lists
    are serialised as JSON text (non-ASCII characters kept as-is), ``None``
    becomes an empty string and every other value is passed through ``str``.

    :param data: Dictionary to flatten; may contain nested dictionaries and lists.
    :type data: dict[str, Any]
    :param prefix: Key path prepended to every produced key (empty at the top level).
    :type prefix: str, optional
    :return: Flat dictionary mapping dot-separated keys to string values.
    :rtype: dict[str, str]
    """

    def _cells(mapping: dict[str, Any], parent: str):
        """Yield ``(key, text)`` pairs for *mapping* in traversal order."""
        for name, item in mapping.items():
            path = name if not parent else f"{parent}.{name}"
            if isinstance(item, dict):
                yield from _cells(item, path)
            elif isinstance(item, list):
                yield path, json.dumps(item, ensure_ascii=False)
            else:
                yield path, "" if item is None else str(item)

    # dict() keeps the first position of a repeated key and its last value,
    # which matches successive in-place updates of one output dictionary.
    return dict(_cells(data, prefix))
97
+
98
+
99
class ThunderDots:
    """Client for fetching and processing data from a DTS endpoint.

    The client walks collections, fetches the resources they link to, builds
    one result dictionary and can optionally validate it, write it to a JSON
    file, and write a flat CSV index of the fetched resources.
    """

    def __init__(
        self,
        endpoint_dts: str,
        fetch_collection_metadata: bool = True,
        fetch_resource_metadata: bool = True,
        collection_params: dict[str, Any] | None = None,
        resource_params: dict[str, Any] | None = None,
        validate: bool = False,
        validation_profile: str = "dts",
        verbose: bool = True,
        concurrency: int = 20,
        timeout: float = 30.0,
        request_timeout: float = 20.0,
        retries: int = 2,
        backoff_ms: int = 200,
        output_path: str | None = None,
        cache_csv_path: str | None = None,
        use_cache: bool = True,
    ) -> None:
        """Initialize the client and store its configuration.

        :param endpoint_dts: Base URL of the DTS endpoint (e.g.
            "https://example.com/dts"). Trailing slashes are removed.
        :type endpoint_dts: str
        :param fetch_collection_metadata: Whether to fetch metadata for collections (default: True).
        :type fetch_collection_metadata: bool
        :param fetch_resource_metadata: Whether to fetch metadata for resources (default: True).
        :type fetch_resource_metadata: bool
        :param collection_params: Parameters for collection fetching/filtering,
            converted with ``CollectionParams.from_dict`` (default: None).
        :type collection_params: dict[str, Any] | None
        :param resource_params: Parameters for resource fetching/filtering,
            converted with ``ResourceParams.from_dict`` (default: None).
        :type resource_params: dict[str, Any] | None
        :param validate: Whether to attach validation reports to the results (default: False).
        :type validate: bool
        :param validation_profile: Validation profile name stored on the
            configuration (default: "dts"). This class itself validates with
            the fixed profiles "output" and "resource_result".
        :type validation_profile: str
        :param verbose: Whether to enable the progress UI (default: True).
        :type verbose: bool
        :param concurrency: Concurrency passed to the HTTP fetcher (default: 20).
        :type concurrency: int
        :param timeout: Timeout in seconds stored on the configuration
            (default: 30.0). It is not the value handed to the fetcher; see
            ``request_timeout``.
        :type timeout: float
        :param request_timeout: Timeout in seconds passed to the HTTP fetcher (default: 20.0).
        :type request_timeout: float
        :param retries: Number of retries passed to the HTTP fetcher (default: 2).
        :type retries: int
        :param backoff_ms: Base backoff in milliseconds passed to the HTTP fetcher (default: 200).
        :type backoff_ms: int
        :param output_path: Optional path of the JSON results file (e.g.
            "output/results.json"). When unset, results are not saved to disk
            and no JSON cache is used (default: None).
        :type output_path: str | None
        :param cache_csv_path: Optional path of a flat CSV index of the fetched
            resources; when unset no CSV is written (default: None).
        :type cache_csv_path: str | None
        :param use_cache: When true and ``output_path`` points to an existing
            file, ``fetch``/``afetch`` load the results from that file instead
            of querying the endpoint (default: True).
        :type use_cache: bool
        :raises ValueError: If ``endpoint_dts`` is empty.
        """
        endpoint = (endpoint_dts or "").rstrip("/")
        if not endpoint:
            raise ValueError("endpoint_dts is required")

        self.config = ThunderDotsConfig(
            endpoint_dts=endpoint,
            fetch_collection_metadata=fetch_collection_metadata,
            fetch_resource_metadata=fetch_resource_metadata,
            validate=validate,
            validation_profile=validation_profile,
            collection_params=CollectionParams.from_dict(collection_params),
            resource_params=ResourceParams.from_dict(resource_params),
            verbose=verbose,
            concurrency=int(concurrency),
            timeout=float(timeout),
            request_timeout=float(request_timeout),
            retries=int(retries),
            backoff_ms=int(backoff_ms),
            output_path=output_path,
            cache_csv_path=cache_csv_path,
            use_cache=bool(use_cache),
        )

        self._stats = Stats()

        # Result dictionary of the last fetch (or cache load); None until then.
        self._results: dict[str, Any] | None = None

        # Thread pool installed as the event loop's default executor during a fetch.
        self._executor: concurrent.futures.ThreadPoolExecutor | None = None

    # ---------------- PUBLIC API ---------------- #

    def _validate_results_if_needed(self) -> None:
        """Attach validation reports to the current results when enabled.

        Does nothing when validation is disabled or when no result dictionary
        is available. Otherwise the whole output is validated with the
        "output" profile, the resource results with the "resource_result"
        profile, and both reports are stored under the "validation" key.
        Used after a fresh fetch and after loading results from cache.

        :return: None.
        :rtype: None
        """
        if not self.config.validate:
            return

        if not isinstance(self._results, dict):
            return

        output_report = validate_notice(self._results, profile="output")

        resource_report = validate_many(
            self._results.get("resource_results", []),
            profile="resource_result",
        )

        self._results["validation"] = {
            "output": output_report.to_dict(),
            "resources": resource_report.summary(),
        }

    async def afetch(self) -> None:
        """Fetch data asynchronously from the DTS endpoint.

        If cache loading is enabled and a cached result exists, the cached
        output is loaded and optionally validated before returning; otherwise
        the full fetch pipeline runs on the current event loop.

        The pipeline installs this client's thread pool as the loop's default
        executor; call :meth:`close` once the loop is finished with it.

        :return: None.
        :rtype: None
        """
        if self._load_results_from_cache():
            self._validate_results_if_needed()
            return

        await self._async_fetch()

    def fetch(self) -> None:
        """Fetch collections and resources from the DTS endpoint and build results.

        Synchronous entry point. When cached results can be loaded they are
        used (and optionally validated) and nothing is fetched. Otherwise the
        async pipeline is run: directly with ``asyncio.run`` when the calling
        thread has no running event loop, or on a helper thread when it does
        (e.g. inside a notebook).

        :return: None.
        :rtype: None
        """
        if self._load_results_from_cache():
            self._validate_results_if_needed()
            return

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            loop_is_running = False
        else:
            loop_is_running = True

        try:
            if loop_is_running:
                _run_coro_in_thread(self._async_fetch)
            else:
                asyncio.run(self._async_fetch())
        finally:
            # Both paths run on a temporary loop via asyncio.run(), which shuts
            # down the loop's default executor on exit. Drop our reference so
            # the next fetch creates a fresh pool instead of reusing a dead one.
            self.close()

    def results(self) -> dict[str, Any]:
        """Return the full results of the last fetch.

        :return: The result dictionary produced by the fetch pipeline or loaded
            from cache, or an empty dictionary when no results are available.
        :rtype: dict[str, Any]
        """
        return self._results or {}

    def collection_results(self) -> dict[str, Any]:
        """Return only the collection results of the last fetch.

        :return: A dictionary with keys "dtsVersion", "type" and
            "collection_results"; the latter is an empty list when no results
            are available.
        :rtype: dict[str, Any]
        """
        return {
            "dtsVersion": "1-alpha",
            "type": "Collection",
            "collection_results": self.results().get("collection_results", []),
        }

    def resource_results(self) -> dict[str, Any]:
        """Return only the resource results of the last fetch.

        :return: A dictionary with keys "dtsVersion", "type" and
            "resource_results"; the latter is an empty list when no results
            are available.
        :rtype: dict[str, Any]
        """
        return {
            "dtsVersion": "1-alpha",
            "type": "Resource",
            "resource_results": self.results().get("resource_results", []),
        }

    def stats(self) -> dict[str, Any]:
        """Return the statistics collected by this client as a dictionary.

        :return: The dictionary form of the client's ``Stats`` object.
        :rtype: dict[str, Any]
        """
        return self._stats.to_dict()

    def _write_results_if_needed(self) -> None:
        """Write the results to ``output_path`` as JSON, if a path is configured.

        The parent directory is created when missing. The file is written as
        UTF-8, indented, with non-ASCII characters kept as-is.

        :return: None.
        :rtype: None
        """
        if not self.config.output_path:
            return

        target = Path(self.config.output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(
            json.dumps(self.results(), ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    # ---------------- INTERNAL ---------------- #

    def _load_results_from_cache(self) -> bool:
        """Load cached JSON results if enabled and available.

        The cache is the JSON file at ``output_path``. A missing, unreadable,
        malformed or non-dictionary file is treated as a cache miss, and in
        those failure cases ``self._results`` is reset to ``None``.

        :return: True when a dictionary was loaded into ``self._results``.
        :rtype: bool
        """
        if not self.config.use_cache:
            return False
        if not self.config.output_path:
            return False

        cache_file = Path(self.config.output_path)
        if not cache_file.exists():
            return False

        try:
            payload = json.loads(cache_file.read_text(encoding="utf-8"))
        except Exception:
            # Best-effort cache: any read/decode failure simply means "no cache".
            self._results = None
            return False

        if not isinstance(payload, dict):
            # Never keep a non-dict payload: results() must return a dictionary.
            self._results = None
            return False

        self._results = payload
        return True

    def _write_cache_csv_if_needed(self) -> None:
        """Write a flat CSV cache/index for fetched resources.

        Does nothing when ``cache_csv_path`` is unset or when the resource
        results are not a list. Each resource becomes one row with the fixed
        columns "id", "title", "linked_parents", "fragments_count" and
        "text_length", followed by one column per flattened metadata key
        (sorted by name).

        :return: None.
        :rtype: None
        """
        if not self.config.cache_csv_path:
            return

        resources = self.results().get("resource_results", [])
        if not isinstance(resources, list):
            return

        path = Path(self.config.cache_csv_path)
        path.parent.mkdir(parents=True, exist_ok=True)

        base_columns = ["id", "title", "linked_parents", "fragments_count", "text_length"]
        rows: list[dict[str, str]] = []
        metadata_columns: set[str] = set()

        for resource in resources:
            metadata = canonicalize_metadata_keys(resource.get("metadata") or {})
            fragments = resource.get("fragments") or []

            # Total number of characters across all fragment contents.
            text_length = sum(len(frag.get("content") or "") for frag in fragments)

            row: dict[str, str] = {
                "id": str(resource.get("id", "")),
                "title": str(resource.get("title", "") or ""),
                "linked_parents": json.dumps(
                    resource.get("linked_parents", []),
                    ensure_ascii=False,
                ),
                "fragments_count": str(len(fragments)),
                "text_length": str(text_length),
            }

            flat_meta = _flatten_for_csv(metadata)
            row.update(flat_meta)
            metadata_columns.update(flat_meta)

            rows.append(row)

        # A metadata key equal to a fixed column name must not be listed twice,
        # otherwise the CSV header would contain duplicate column names.
        fieldnames = [
            *base_columns,
            *sorted(metadata_columns.difference(base_columns)),
        ]

        with path.open("w", encoding="utf-8", newline="") as handle:
            writer = csv.DictWriter(handle, fieldnames=fieldnames, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(rows)

    def _make_fetcher(self) -> Fetcher:
        """Create the fetcher used to query the DTS endpoint.

        An ``HttpxFetcher`` is built from the configuration: endpoint,
        ``request_timeout`` (as the fetcher timeout), concurrency, retries and
        backoff, sharing this client's ``Stats`` object.

        :return: A configured ``HttpxFetcher``.
        :rtype: Fetcher
        """
        return HttpxFetcher(
            endpoint=self.config.endpoint_dts,
            timeout=self.config.request_timeout,
            concurrency=self.config.concurrency,
            retries=self.config.retries,
            backoff_ms=self.config.backoff_ms,
            stats=self._stats,
        )

    def close(self) -> None:
        """Release the thread pool held by the client, if any.

        The pool is shut down without waiting for pending work. Calling this
        method repeatedly is harmless.

        :return: None.
        :rtype: None
        """
        if self._executor:
            self._executor.shutdown(wait=False)
            self._executor = None

    async def _async_fetch(self) -> None:
        """Run the fetch pipeline on the current event loop.

        Steps: install the client's thread pool as the loop's default
        executor, walk the collections, fetch the resources, build the output,
        then validate (when enabled), write the JSON results and write the CSV
        index. The fetcher is always closed afterwards.

        :return: None.
        :rtype: None
        """
        loop = asyncio.get_running_loop()
        if self._executor is None:
            self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=32)
        loop.set_default_executor(self._executor)

        self._stats.start()
        ui = UI(enabled=self.config.verbose)

        fetcher: Fetcher = self._make_fetcher()

        async with ui:
            try:
                ui.start_walk()

                collections, resources = await walk_collections(
                    fetcher, self.config, self._stats, ui=ui
                )

                ui.start_resources(total=len(resources))

                resource_results = await fetch_resources(
                    fetcher, self.config, resources, self._stats, ui=ui
                )

                self._results = build_output(
                    collections,
                    resource_results,
                    self._stats,
                    _package_version(),
                    collection_metadata_dublincore=self.config.collection_params.metadata_dublincore,
                    collection_metadata_extensions=self.config.collection_params.metadata_extensions,
                )

                # Each helper checks its own configuration flag and is a no-op
                # when the corresponding feature is disabled.
                self._validate_results_if_needed()
                self._write_results_if_needed()
                self._write_cache_csv_if_needed()

            finally:
                try:
                    await fetcher.aclose()
                except Exception:
                    # Best-effort cleanup: a failure while closing the fetcher
                    # must not mask the outcome of the fetch itself.
                    pass

                # NOTE(review): nesting reconstructed from a flattened diff.
                # Stopping the stats and finalizing the UI here guarantees
                # both happen even when the pipeline raised - confirm this
                # matches the intended placement.
                self._stats.stop()
                ui.finalize(self._stats.to_dict())

    def notices(self) -> list[DotsNotice]:
        """Convert the resource results into ``DotsNotice`` objects.

        :return: One ``DotsNotice`` per entry of "resource_results", built with
            ``DotsNotice.from_resource_result``; empty when no results are available.
        :rtype: list[DotsNotice]
        """
        return [
            DotsNotice.from_resource_result(item)
            for item in self.results().get("resource_results", [])
        ]

    def to_elastic_documents(
        self, *, include_fragments: bool = True, include_raw: bool = False
    ) -> list[dict[str, Any]]:
        """Convert the resource results into ElasticSearch documents.

        Each notice is converted with ``DotsNotice.to_elastic_document``.

        :param include_fragments: Forwarded to ``to_elastic_document``; whether
            to include the fragments in the documents (default: True).
        :type include_fragments: bool
        :param include_raw: Forwarded to ``to_elastic_document``; whether to
            include the raw metadata in the documents (default: False).
        :type include_raw: bool
        :return: One document dictionary per notice.
        :rtype: list[dict[str, Any]]
        """
        return [
            notice.to_elastic_document(
                include_fragments=include_fragments,
                include_raw=include_raw,
            )
            for notice in self.notices()
        ]

    def to_elastic_actions(
        self,
        *,
        index: str,
        include_fragments: bool = True,
        include_raw: bool = False,
    ) -> list[dict[str, Any]]:
        """Convert the resource results into ElasticSearch bulk actions.

        Each notice is converted with ``DotsNotice.to_elastic_action``.

        :param index: Name of the ElasticSearch index to use in the actions.
        :type index: str
        :param include_fragments: Forwarded to ``to_elastic_action``; whether
            to include the fragments in the documents (default: True).
        :type include_fragments: bool
        :param include_raw: Forwarded to ``to_elastic_action``; whether to
            include the raw metadata in the documents (default: False).
        :type include_raw: bool
        :return: One action dictionary per notice.
        :rtype: list[dict[str, Any]]
        """
        return [
            notice.to_elastic_action(
                index=index,
                include_fragments=include_fragments,
                include_raw=include_raw,
            )
            for notice in self.notices()
        ]

    def to_qdrant_payloads(
        self, *, include_fragments: bool = True, include_raw: bool = False
    ) -> list[dict[str, Any]]:
        """Convert the resource results into Qdrant payloads.

        Each notice is converted with ``DotsNotice.to_qdrant_payload``.

        :param include_fragments: Forwarded to ``to_qdrant_payload``; whether
            to include the fragments in the payloads (default: True).
        :type include_fragments: bool
        :param include_raw: Forwarded to ``to_qdrant_payload``; whether to
            include the raw metadata in the payloads (default: False).
        :type include_raw: bool
        :return: One payload dictionary per notice.
        :rtype: list[dict[str, Any]]
        """
        return [
            notice.to_qdrant_payload(
                include_fragments=include_fragments,
                include_raw=include_raw,
            )
            for notice in self.notices()
        ]

    def to_qdrant_points(
        self,
        *,
        vectors: list[list[float] | dict[str, Any]] | None = None,
        include_fragments: bool = True,
        include_raw: bool = False,
    ) -> list[dict[str, Any]]:
        """Convert the resource results into Qdrant points.

        Each notice is converted with ``DotsNotice.to_qdrant_point``; when
        vectors are given, the vector at the same position as the notice is
        passed along with it.

        :param vectors: Optional vectors, one per notice and in the same order
            as :meth:`notices`. When omitted, ``None`` is passed as the vector
            of every point (default: None).
        :type vectors: list[list[float] | dict[str, Any]] | None
        :param include_fragments: Forwarded to ``to_qdrant_point``; whether to
            include the fragments in the points (default: True).
        :type include_fragments: bool
        :param include_raw: Forwarded to ``to_qdrant_point``; whether to
            include the raw metadata in the points (default: False).
        :type include_raw: bool
        :return: One point dictionary per notice.
        :rtype: list[dict[str, Any]]
        :raises ValueError: If ``vectors`` is given and its length differs from
            the number of notices.
        """
        notices = self.notices()

        if vectors is not None and len(vectors) != len(notices):
            raise ValueError(
                f"vectors length mismatch: got {len(vectors)} vectors for {len(notices)} notices"
            )

        return [
            notice.to_qdrant_point(
                vector=None if vectors is None else vectors[index],
                include_fragments=include_fragments,
                include_raw=include_raw,
            )
            for index, notice in enumerate(notices)
        ]