thunderdots 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. thunderdots-0.1.0/.github/workflows/ci.yml +144 -0
  2. thunderdots-0.1.0/.github/workflows/release.yml +96 -0
  3. thunderdots-0.1.0/.gitignore +34 -0
  4. thunderdots-0.1.0/CITATION.cff +47 -0
  5. thunderdots-0.1.0/LICENSE.md +21 -0
  6. thunderdots-0.1.0/Makefile +149 -0
  7. thunderdots-0.1.0/PKG-INFO +194 -0
  8. thunderdots-0.1.0/README.md +157 -0
  9. thunderdots-0.1.0/RELEASE.md +174 -0
  10. thunderdots-0.1.0/assets/dots-light.png +0 -0
  11. thunderdots-0.1.0/assets/dots-logo-retro.drawio.png +0 -0
  12. thunderdots-0.1.0/main.py +459 -0
  13. thunderdots-0.1.0/mkdocs/docs/api-reference.md +70 -0
  14. thunderdots-0.1.0/mkdocs/docs/assets/dots-light.png +0 -0
  15. thunderdots-0.1.0/mkdocs/docs/assets/dots-logo-retro.drawio.png +0 -0
  16. thunderdots-0.1.0/mkdocs/docs/assets/logo-chartes.png +0 -0
  17. thunderdots-0.1.0/mkdocs/docs/caching-async.md +38 -0
  18. thunderdots-0.1.0/mkdocs/docs/configuration.md +256 -0
  19. thunderdots-0.1.0/mkdocs/docs/exports.md +81 -0
  20. thunderdots-0.1.0/mkdocs/docs/fragmentation.md +109 -0
  21. thunderdots-0.1.0/mkdocs/docs/index.md +64 -0
  22. thunderdots-0.1.0/mkdocs/docs/installation.md +56 -0
  23. thunderdots-0.1.0/mkdocs/docs/metadata-validation.md +56 -0
  24. thunderdots-0.1.0/mkdocs/docs/notebooks/artifacts/thunderdots/resources_cache.csv +25 -0
  25. thunderdots-0.1.0/mkdocs/docs/notebooks/artifacts/thunderdots/results.json +1522 -0
  26. thunderdots-0.1.0/mkdocs/docs/notebooks/thunderdots_playground.ipynb +4019 -0
  27. thunderdots-0.1.0/mkdocs/docs/quickstart.md +67 -0
  28. thunderdots-0.1.0/mkdocs/docs/stylesheets/extra.css +32 -0
  29. thunderdots-0.1.0/mkdocs/docs/troubleshooting.md +25 -0
  30. thunderdots-0.1.0/mkdocs/mkdocs.yml +91 -0
  31. thunderdots-0.1.0/mkdocs/overrides/partials/copyright.html +10 -0
  32. thunderdots-0.1.0/pyproject.toml +64 -0
  33. thunderdots-0.1.0/pytest.ini +4 -0
  34. thunderdots-0.1.0/setup.cfg +4 -0
  35. thunderdots-0.1.0/tests/conftest.py +109 -0
  36. thunderdots-0.1.0/tests/fixtures/json/collection_encpos_2025.json +70 -0
  37. thunderdots-0.1.0/tests/fixtures/json/navigation_encpos_2025_01.json +22 -0
  38. thunderdots-0.1.0/tests/fixtures/json/resource_encpos_2025_01.json +28 -0
  39. thunderdots-0.1.0/tests/fixtures/xml/encpos_1893_05.xml +35 -0
  40. thunderdots-0.1.0/tests/fixtures/xml/smcp_pr_0004.xml +19 -0
  41. thunderdots-0.1.0/tests/test_client_offline.py +135 -0
  42. thunderdots-0.1.0/tests/test_exports_and_validation.py +118 -0
  43. thunderdots-0.1.0/tests/test_fixtures_integrity.py +33 -0
  44. thunderdots-0.1.0/tests/test_metadata_and_config.py +116 -0
  45. thunderdots-0.1.0/tests/test_online_dts.py +86 -0
  46. thunderdots-0.1.0/tests/test_tei_extraction.py +100 -0
  47. thunderdots-0.1.0/thunderdots/__init__.py +12 -0
  48. thunderdots-0.1.0/thunderdots/__version__.py +24 -0
  49. thunderdots-0.1.0/thunderdots/client.py +559 -0
  50. thunderdots-0.1.0/thunderdots/config.py +204 -0
  51. thunderdots-0.1.0/thunderdots/extract/resources.py +220 -0
  52. thunderdots-0.1.0/thunderdots/extract/tei.py +648 -0
  53. thunderdots-0.1.0/thunderdots/extract/walker.py +139 -0
  54. thunderdots-0.1.0/thunderdots/fetcher.py +260 -0
  55. thunderdots-0.1.0/thunderdots/normalize/__init__.py +3 -0
  56. thunderdots-0.1.0/thunderdots/normalize/dates.py +193 -0
  57. thunderdots-0.1.0/thunderdots/normalize/metadata.py +130 -0
  58. thunderdots-0.1.0/thunderdots/normalize/output.py +69 -0
  59. thunderdots-0.1.0/thunderdots/orm.py +383 -0
  60. thunderdots-0.1.0/thunderdots/stats.py +52 -0
  61. thunderdots-0.1.0/thunderdots/ui.py +161 -0
  62. thunderdots-0.1.0/thunderdots/validation/__init__.py +10 -0
  63. thunderdots-0.1.0/thunderdots/validation/models.py +61 -0
  64. thunderdots-0.1.0/thunderdots/validation/rules.py +63 -0
  65. thunderdots-0.1.0/thunderdots/validation/schemas.py +103 -0
  66. thunderdots-0.1.0/thunderdots/validation/validators.py +114 -0
  67. thunderdots-0.1.0/thunderdots.egg-info/PKG-INFO +194 -0
  68. thunderdots-0.1.0/thunderdots.egg-info/SOURCES.txt +70 -0
  69. thunderdots-0.1.0/thunderdots.egg-info/dependency_links.txt +1 -0
  70. thunderdots-0.1.0/thunderdots.egg-info/requires.txt +24 -0
  71. thunderdots-0.1.0/thunderdots.egg-info/top_level.txt +1 -0
  72. thunderdots-0.1.0/uv.lock +2911 -0
@@ -0,0 +1,144 @@
1
+ name: CI/CD
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - "**"
7
+ pull_request:
8
+ branches:
9
+ - "**"
10
+ workflow_dispatch:
11
+
12
+ permissions:
13
+ contents: read
14
+
15
+ concurrency:
16
+ group: ci-${{ github.ref }}
17
+ cancel-in-progress: true
18
+
19
+ jobs:
20
+ python-lint:
21
+ name: Python lint and format
22
+ runs-on: ubuntu-latest
23
+
24
+ steps:
25
+ - name: Checkout
26
+ uses: actions/checkout@v4
27
+
28
+ - name: Setup uv
29
+ uses: astral-sh/setup-uv@v5
30
+ with:
31
+ enable-cache: true
32
+
33
+ - name: Setup Python
34
+ run: uv python install 3.12
35
+
36
+ - name: Install dependencies
37
+ run: uv sync --all-extras --dev
38
+
39
+ - name: Ruff format check
40
+ run: uv run ruff format --check
41
+
42
+ - name: Ruff lint
43
+ run: uv run ruff check
44
+
45
+ python-tests:
46
+ name: Python unit tests
47
+ runs-on: ubuntu-latest
48
+ needs: python-lint
49
+
50
+ steps:
51
+ - name: Checkout
52
+ uses: actions/checkout@v4
53
+
54
+ - name: Setup uv
55
+ uses: astral-sh/setup-uv@v5
56
+ with:
57
+ enable-cache: true
58
+
59
+ - name: Setup Python
60
+ run: uv python install 3.12
61
+
62
+ - name: Install dependencies
63
+ run: uv sync --all-extras --dev
64
+
65
+ - name: Run unit tests
66
+ run: uv run pytest
67
+
68
+ package-build:
69
+ name: Build Python package
70
+ runs-on: ubuntu-latest
71
+ needs: python-tests
72
+
73
+ steps:
74
+ - name: Checkout
75
+ uses: actions/checkout@v4
76
+ with:
77
+ fetch-depth: 0
78
+
79
+ - name: Setup uv
80
+ uses: astral-sh/setup-uv@v5
81
+ with:
82
+ enable-cache: true
83
+
84
+ - name: Setup Python
85
+ run: uv python install 3.12
86
+
87
+ - name: Install dependencies
88
+ run: uv sync --all-extras --dev
89
+
90
+ - name: Build package
91
+ run: uv build
92
+
93
+ - name: Check package metadata
94
+ run: uvx twine check dist/*
95
+
96
+ docs-build:
97
+ name: Build documentation
98
+ runs-on: ubuntu-latest
99
+ needs: python-tests
100
+
101
+ steps:
102
+ - name: Checkout
103
+ uses: actions/checkout@v4
104
+
105
+ - name: Setup uv
106
+ uses: astral-sh/setup-uv@v5
107
+ with:
108
+ enable-cache: true
109
+
110
+ - name: Setup Python
111
+ run: uv python install 3.12
112
+
113
+ - name: Install dependencies
114
+ run: uv sync --all-extras --dev
115
+
116
+ - name: Build MkDocs documentation
117
+ run: uv run mkdocs build --strict -f mkdocs/mkdocs.yml
118
+
119
+ - name: Upload GitHub Pages artifact
120
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
121
+ uses: actions/upload-pages-artifact@v3
122
+ with:
123
+ path: mkdocs/site
124
+
125
+ docs-deploy:
126
+ name: Deploy documentation to GitHub Pages
127
+ runs-on: ubuntu-latest
128
+ needs: docs-build
129
+
130
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
131
+
132
+ permissions:
133
+ contents: read
134
+ pages: write
135
+ id-token: write
136
+
137
+ environment:
138
+ name: github-pages
139
+ url: ${{ steps.deployment.outputs.page_url }}
140
+
141
+ steps:
142
+ - name: Deploy to GitHub Pages
143
+ id: deployment
144
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,96 @@
1
+ name: Release to PyPI
2
+
3
+ on:
4
+ release:
5
+ types:
6
+ - published
7
+
8
+ permissions:
9
+ contents: read
10
+ id-token: write
11
+
12
+ jobs:
13
+ pypi-release:
14
+ name: Build and publish to PyPI
15
+ runs-on: ubuntu-latest
16
+
17
+ environment:
18
+ name: pypi
19
+ url: https://pypi.org/project/thunderdots/
20
+
21
+ steps:
22
+ - name: Checkout
23
+ uses: actions/checkout@v4
24
+ with:
25
+ fetch-depth: 0
26
+
27
+ - name: Setup uv
28
+ uses: astral-sh/setup-uv@v5
29
+ with:
30
+ enable-cache: true
31
+
32
+ - name: Setup Python
33
+ run: uv python install 3.12
34
+
35
+ - name: Install dependencies
36
+ run: uv sync --all-extras --dev
37
+
38
+ - name: Check release tag format
39
+ run: |
40
+ uv run python - <<'PY'
41
+ import os
42
+ import re
43
+
44
+ tag = os.environ["GITHUB_REF_NAME"]
45
+
46
+ if not re.fullmatch(r"v\d+\.\d+\.\d+([a-zA-Z0-9.\-]+)?", tag):
47
+ raise SystemExit(
48
+ f"Invalid release tag '{tag}'. Expected something like v0.1.0"
49
+ )
50
+
51
+ print(f"Release tag format OK: {tag}")
52
+ PY
53
+
54
+ - name: Build package
55
+ run: uv build
56
+
57
+ - name: Check built package version
58
+ run: |
59
+ uv run python - <<'PY'
60
+ import os
61
+ import zipfile
62
+ from pathlib import Path
63
+ from email.parser import Parser
64
+
65
+ tag = os.environ["GITHUB_REF_NAME"]
66
+ expected = tag[1:] if tag.startswith("v") else tag
67
+
68
+ wheels = list(Path("dist").glob("*.whl"))
69
+ if not wheels:
70
+ raise SystemExit("No wheel found in dist/")
71
+
72
+ wheel = wheels[0]
73
+ with zipfile.ZipFile(wheel) as zf:
74
+ metadata_name = next(
75
+ name for name in zf.namelist()
76
+ if name.endswith(".dist-info/METADATA")
77
+ )
78
+ metadata = Parser().parsestr(
79
+ zf.read(metadata_name).decode("utf-8")
80
+ )
81
+
82
+ actual = metadata["Version"]
83
+
84
+ if actual != expected:
85
+ raise SystemExit(
86
+ f"Built package version '{actual}' does not match release tag '{tag}'"
87
+ )
88
+
89
+ print(f"Built package version OK: {actual}")
90
+ PY
91
+
92
+ - name: Check package metadata
93
+ run: uvx twine check dist/*
94
+
95
+ - name: Publish package to PyPI
96
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,34 @@
1
+ # Python
2
+
3
+ .venv/
4
+ venv/
5
+ env/
6
+ ENV/
7
+ env.bak/
8
+ venv.bak/
9
+ __pycache__/
10
+
11
+ .cache/
12
+
13
+ # Jupyter Notebook
14
+ .ipynb_checkpoints/
15
+
16
+ # VS Code
17
+ .vscode/
18
+
19
+ # PyCharm
20
+ .idea/
21
+
22
+ # MacOS
23
+ .DS_Store
24
+
25
+ # scripts
26
+ project_dump.txt
27
+ dump_project.py
28
+ specs
29
+ out_results/
30
+ thunderdots/native/build/*
+ mkdocs/site/
31
+ site/
32
+
33
+ *.egg-info/
34
+ __version__.py
@@ -0,0 +1,47 @@
1
+ cff-version: 1.2.0
2
+ message: "If you use this software, please cite it using the metadata from this file."
3
+ title: "ThunderDots"
4
+ abstract: >
5
+ A Python client for Distributed Text Services (DTS) endpoints, initially built
6
+ for DoTS. ThunderDots walks DTS collections and subcollections, fetches
7
+ resources and TEI/XML documents, extracts text fragments from full documents,
8
+ DTS navigation, or custom TEI XPath rules, filters Dublin Core and extension
9
+ metadata, validates generated outputs, and exports records for indexing,
10
+ full-text search, RAG, vector databases, or corpus-analysis pipelines.
11
+ type: software
12
+ authors:
13
+ - family-names: Terriel
14
+ given-names: Lucas
15
+ orcid: "https://orcid.org/0000-0002-9189-258X"
16
+ affiliation: "École nationale des chartes – PSL"
17
+ repository-code: "https://github.com/chartes/thunderdots"
18
+ url: "https://github.com/chartes/thunderdots"
19
+ license: MIT
20
+ keywords:
21
+ - DTS
22
+ - Distributed Text Services
23
+ - TEI
24
+ - XML
25
+ - text extraction
26
+ - digital humanities
27
+ - cultural heritage
28
+ - corpus analysis
29
+ - full-text search
30
+ - RAG
31
+ - Elasticsearch
32
+ - Qdrant
33
+ - Python
34
+ version: "0.1.0"
35
+ date-released: "2026-01-01"
36
+ preferred-citation:
37
+ type: software
38
+ title: "ThunderDots"
39
+ authors:
40
+ - family-names: Terriel
41
+ given-names: Lucas
42
+ orcid: "https://orcid.org/0000-0002-9189-258X"
43
+ affiliation: "École nationale des chartes – PSL"
44
+ repository-code: "https://github.com/chartes/thunderdots"
45
+ url: "https://github.com/chartes/thunderdots"
46
+ license: MIT
47
+ date-released: "2026-01-01"
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 ThunderDots contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,149 @@
1
+ .PHONY: help \
2
+ dev.sync dev.install \
3
+ docs.prepare docs.serve docs.build docs.build-dev docs.clean docs.check docs.deploy \
4
+ tests.unit tests.network tests.all \
5
+ lint.check lint.format \
6
+ check clean
7
+
8
+ # ---------------------------------------------------------------------
9
+ # Tooling
10
+ # ---------------------------------------------------------------------
11
+
12
+ UV ?= uv
13
+ PYTHON ?= $(UV) run python
14
+ PYTEST ?= $(PYTHON) -m pytest
15
+ RUFF ?= $(PYTHON) -m ruff
16
+ MKDOCS ?= $(PYTHON) -m mkdocs
17
+
18
+ # ---------------------------------------------------------------------
19
+ # Paths
20
+ # ---------------------------------------------------------------------
21
+
22
+ DOCS_DIR ?= mkdocs
23
+ DOCS_CONFIG ?= $(DOCS_DIR)/mkdocs.yml
24
+ DOCS_SRC ?= $(DOCS_DIR)/docs
25
+ DOCS_SITE ?= $(DOCS_DIR)/site
26
+
27
+ ROOT_ASSETS_DIR ?= assets
28
+ DOCS_ASSETS_DIR ?= $(DOCS_SRC)/assets
29
+
30
+ NOTEBOOKS_DIR ?= notebooks
31
+ DOCS_NOTEBOOKS_DIR ?= $(DOCS_SRC)/notebooks
32
+ USER_NOTEBOOK ?= thunderdots_documentation_utilisateur.ipynb
33
+
34
+ # ---------------------------------------------------------------------
35
+ # Help
36
+ # ---------------------------------------------------------------------
37
+
38
+ help:
39
+ @echo "ThunderDots development commands"
40
+ @echo ""
41
+ @echo "Environment:"
42
+ @echo " make dev.sync Sync project dependencies with uv"
43
+ @echo " make dev.install Install project with dev and docs extras"
44
+ @echo ""
45
+ @echo "Documentation:"
46
+ @echo " make docs.prepare Copy assets and notebooks into MkDocs docs_dir"
47
+ @echo " make docs.serve Serve MkDocs locally"
48
+ @echo " make docs.build Build documentation in strict mode"
49
+ @echo " make docs.build-dev Build documentation without strict mode"
50
+ @echo " make docs.check Validate documentation build"
51
+ @echo " make docs.deploy Deploy documentation to gh-pages"
52
+ @echo " make docs.clean Remove generated documentation site"
53
+ @echo ""
54
+ @echo "Tests:"
55
+ @echo " make tests.unit Run unit tests without network tests"
56
+ @echo " make tests.network Run tests with online DTS tests"
57
+ @echo " make tests.all Run all tests"
58
+ @echo ""
59
+ @echo "Linting:"
60
+ @echo " make lint.check Run Ruff checks"
61
+ @echo " make lint.format Format code with Ruff"
62
+ @echo ""
63
+ @echo "Global:"
64
+ @echo " make check Run lint + tests + docs check"
65
+ @echo " make clean Remove generated caches and build artefacts"
66
+
67
+ # ---------------------------------------------------------------------
68
+ # Environment
69
+ # ---------------------------------------------------------------------
70
+
71
+ dev.sync:
72
+ $(UV) sync --extra dev --extra docs
73
+
74
+ dev.install:
75
+ $(UV) pip install -e ".[dev,docs]"
76
+
77
+ # ---------------------------------------------------------------------
78
+ # Documentation
79
+ # ---------------------------------------------------------------------
80
+
81
+ docs.prepare:
82
+ mkdir -p $(DOCS_ASSETS_DIR)
83
+ mkdir -p $(DOCS_NOTEBOOKS_DIR)
84
+ @if [ -d "$(ROOT_ASSETS_DIR)" ]; then \
85
+ cp -R $(ROOT_ASSETS_DIR)/* $(DOCS_ASSETS_DIR)/ 2>/dev/null || true; \
86
+ fi
87
+ @if [ -f "$(NOTEBOOKS_DIR)/$(USER_NOTEBOOK)" ]; then \
88
+ cp "$(NOTEBOOKS_DIR)/$(USER_NOTEBOOK)" "$(DOCS_NOTEBOOKS_DIR)/$(USER_NOTEBOOK)"; \
89
+ fi
90
+
91
+ docs.serve: docs.prepare
92
+ $(MKDOCS) serve -f $(DOCS_CONFIG)
93
+
94
+ docs.build: docs.prepare
95
+ $(MKDOCS) build -f $(DOCS_CONFIG) --strict
96
+
97
+ docs.build-dev: docs.prepare
98
+ $(MKDOCS) build -f $(DOCS_CONFIG)
99
+
100
+ docs.clean:
101
+ rm -rf $(DOCS_SITE)
102
+
103
+ docs.check: docs.build
104
+
105
+ docs.deploy: docs.prepare
106
+ $(MKDOCS) gh-deploy -f $(DOCS_CONFIG) --force
107
+
108
+ # ---------------------------------------------------------------------
109
+ # Tests
110
+ # ---------------------------------------------------------------------
111
+
112
+ tests.unit:
113
+ $(PYTEST)
114
+
115
+ tests.network:
116
+ RUN_NETWORK_TESTS=1 $(PYTEST)
117
+
118
+ tests.all: tests.network
119
+
120
+ # ---------------------------------------------------------------------
121
+ # Linting / formatting
122
+ # ---------------------------------------------------------------------
123
+
124
+ lint.check:
125
+ $(RUFF) check thunderdots tests
126
+
127
+ lint.format:
128
+ $(RUFF) format thunderdots tests
129
+ $(RUFF) check thunderdots tests --fix
130
+
131
+ # ---------------------------------------------------------------------
132
+ # Global checks
133
+ # ---------------------------------------------------------------------
134
+
135
+ check: lint.check tests.unit docs.check
136
+
137
+ # ---------------------------------------------------------------------
138
+ # Cleaning
139
+ # ---------------------------------------------------------------------
140
+
141
+ clean: docs.clean
142
+ rm -rf .pytest_cache
143
+ rm -rf .ruff_cache
144
+ rm -rf htmlcov
145
+ rm -rf dist
146
+ rm -rf build
147
+ rm -rf *.egg-info
148
+ find . -type d -name "__pycache__" -prune -exec rm -rf {} +
149
+ find . -type f -name "*.pyc" -delete
@@ -0,0 +1,194 @@
1
+ Metadata-Version: 2.4
2
+ Name: thunderdots
3
+ Version: 0.1.0
4
+ Summary: ThunderDots is a Python client for DTS (Distributed Text Services) endpoints, initially built for DoTS.
5
+ Author: Lucas Terriel, École nationale des chartes - PSL
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/chartes/thunderdots
8
+ Project-URL: Repository, https://github.com/chartes/thunderdots
9
+ Project-URL: Documentation, https://chartes.github.io/thunderdots/
10
+ Project-URL: Issues, https://github.com/chartes/thunderdots/issues
11
+ Keywords: dts,tei,xml,digital-humanities,corpus,dots
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE.md
15
+ Requires-Dist: edtf>=5.0.1
16
+ Requires-Dist: h2>=4.3.0
17
+ Requires-Dist: httpx[http2]>=0.28.1
18
+ Requires-Dist: jsonschema>=4.26.0
19
+ Requires-Dist: lxml>=6.0.2
20
+ Requires-Dist: notebook>=7.5.6
21
+ Requires-Dist: rich>=14.3.2
22
+ Requires-Dist: tqdm>=4.67.3
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest>=9.0.2; extra == "dev"
25
+ Requires-Dist: ruff>=0.15.1; extra == "dev"
26
+ Requires-Dist: build; extra == "dev"
27
+ Requires-Dist: twine; extra == "dev"
28
+ Provides-Extra: docs
29
+ Requires-Dist: mkdocs>=1.6.1; extra == "docs"
30
+ Requires-Dist: mkdocs-material>=9.5; extra == "docs"
31
+ Requires-Dist: mkdocs-jupyter>=0.25; extra == "docs"
32
+ Requires-Dist: jupyter; extra == "docs"
33
+ Requires-Dist: ipykernel; extra == "docs"
34
+ Provides-Extra: all
35
+ Requires-Dist: thunderdots[dev,docs]; extra == "all"
36
+ Dynamic: license-file
37
+
38
+ <h1 align="center">
39
+ <img src="https://raw.githubusercontent.com/chartes/thunderdots/main/assets/dots-light.png" alt="ThunderDots" width="450"><br>
40
+ ThunderDots — DTS client for documentary corpora
41
+ </h1>
42
+
43
+ <p align="center">
44
+ <strong>Fast DTS crawling, TEI fragmentation, metadata filtering, validation, and export pipelines.</strong>
45
+ </p>
46
+
47
+ <p align="center">
48
+ <a href="https://github.com/astral-sh/uv">
49
+ <img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json" alt="uv">
50
+ </a>
51
+ <a href="https://github.com/astral-sh/ruff">
52
+ <img src="https://img.shields.io/badge/lint-ruff-0A0A0A?logo=ruff&logoColor=white" alt="ruff">
53
+ </a>
54
+ <a href="https://github.com/chartes/thunderdots/actions/workflows/ci.yml">
55
+ <img src="https://github.com/chartes/thunderdots/actions/workflows/ci.yml/badge.svg" alt="CI">
56
+ </a>
57
+ <a href="https://github.com/chartes/thunderdots/blob/main/LICENSE.md">
58
+ <img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License: MIT">
59
+ </a>
60
+ <a href="https://pypi.org/project/thunderdots/">
61
+ <img src="https://img.shields.io/pypi/v/thunderdots.svg" alt="PyPI version">
62
+ </a>
63
+ </p>
64
+
65
+ ---
66
+
67
+ ## Overview
68
+
69
+ **ThunderDots** is a Python client for [DTS](https://dtsapi.org/specifications/) (*Distributed Text Services*) endpoints, initially built for [DoTS](https://chartes.github.io/dots_documentation/).
70
+
71
+ It helps you move from a remote DTS API to structured Python objects and JSON records that can feed indexing pipelines, including full-text search, RAG/vector databases, and corpus-analysis workflows.
72
+
73
+ ThunderDots focuses on practical documentary workflows: crawling DTS collections, fetching TEI/XML resources, extracting reusable text fragments, selecting metadata, validating outputs, and exporting data to downstream search or indexing systems.
74
+
75
+ ---
76
+
77
+ ## What ThunderDots does
78
+
79
+ ThunderDots can:
80
+
81
+ - walk DTS collections and subcollections;
82
+ - fetch resources and TEI/XML documents;
83
+ - extract text fragments from full documents, DTS navigation, or custom TEI XPath rules;
84
+ - preserve or filter Dublin Core and extension metadata;
85
+ - enrich temporal metadata such as dates and coverage ranges;
86
+ - validate generated outputs with JSON Schema;
87
+ - export records to indexing pipelines such as Elasticsearch or Qdrant-compatible formats;
88
+ - cache fetched corpora as JSON and CSV;
89
+ - run synchronous or asynchronous workflows.
90
+
91
+ ---
92
+
93
+ ## Installation
94
+
95
+ ### With `uv`
96
+
97
+ ```bash
98
+ uv add thunderdots
99
+ ```
100
+
101
+ ### With pip
102
+
103
+ ```bash
104
+ pip install thunderdots
105
+ ```
106
+
107
+ ### For development
108
+
109
+ ```bash
110
+ git clone https://github.com/chartes/thunderdots.git
111
+ cd thunderdots
112
+
113
+ uv venv
114
+ source .venv/bin/activate
115
+ uv sync --all-extras --dev
116
+ ```
117
+
118
+ Or with pip:
119
+
120
+ ```bash
121
+ python -m venv .venv
122
+ source .venv/bin/activate
123
+ pip install -e ".[dev]"
124
+ ```
125
+
126
+ ## Minimal example
127
+
128
+ ```python
129
+ from thunderdots import ThunderDots
130
+
131
+ td = ThunderDots(
132
+ endpoint_dts="https://dots.chartes.psl.eu/api/dts",
133
+ collection_params={"collection_id": "ENCPOS_1900"},
134
+ resource_params={"fragment_mode": "document"},
135
+ )
136
+
137
+ td.fetch()
138
+ results = td.results()
139
+
140
+ print(td.stats())
141
+ ```
142
+
143
+ ## Development
144
+
145
+ ### Run tests
146
+
147
+ ```bash
148
+ pytest
149
+ ```
150
+
151
+ Online DTS tests are opt-in:
152
+
153
+ ```bash
154
+ RUN_NETWORK_TESTS=1 pytest
155
+ ```
156
+
157
+ ### Run Ruff (lint and format checks)
158
+
159
+ ```bash
160
+ ruff format --check
161
+ ruff check
162
+ ```
163
+
164
+ ### Build the documentation
165
+
166
+ ```bash
167
+ mkdocs build --strict -f mkdocs/mkdocs.yml
168
+ ```
169
+
170
+ ### Create a new PyPI release
171
+
172
+ Check the [release checklist](./RELEASE.md) for details.
173
+
174
+ ### License
175
+
176
+ ThunderDots is distributed under the [MIT License](./LICENSE.md).
177
+
178
+ ### Citation
179
+
180
+ If you use ThunderDots in academic work, please cite it as:
181
+
182
+ ```bibtex
183
+ @software{terriel_thunderdots_2026,
184
+ author = {Terriel, Lucas},
185
+ title = {ThunderDots},
186
+ year = {2026},
187
+ publisher = {GitHub},
188
+ institution = {{École nationale des chartes}},
189
+ url = {https://github.com/chartes/thunderdots},
190
+ note = {Python client for Distributed Text Services (DTS) via DoTS}
191
+ }
192
+ ```
193
+
194
+ You can also use the repository metadata from [CITATION.cff](./CITATION.cff).