thunderdots 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thunderdots-0.1.0/.github/workflows/ci.yml +144 -0
- thunderdots-0.1.0/.github/workflows/release.yml +96 -0
- thunderdots-0.1.0/.gitignore +34 -0
- thunderdots-0.1.0/CITATION.cff +47 -0
- thunderdots-0.1.0/LICENSE.md +21 -0
- thunderdots-0.1.0/Makefile +149 -0
- thunderdots-0.1.0/PKG-INFO +194 -0
- thunderdots-0.1.0/README.md +157 -0
- thunderdots-0.1.0/RELEASE.md +174 -0
- thunderdots-0.1.0/assets/dots-light.png +0 -0
- thunderdots-0.1.0/assets/dots-logo-retro.drawio.png +0 -0
- thunderdots-0.1.0/main.py +459 -0
- thunderdots-0.1.0/mkdocs/docs/api-reference.md +70 -0
- thunderdots-0.1.0/mkdocs/docs/assets/dots-light.png +0 -0
- thunderdots-0.1.0/mkdocs/docs/assets/dots-logo-retro.drawio.png +0 -0
- thunderdots-0.1.0/mkdocs/docs/assets/logo-chartes.png +0 -0
- thunderdots-0.1.0/mkdocs/docs/caching-async.md +38 -0
- thunderdots-0.1.0/mkdocs/docs/configuration.md +256 -0
- thunderdots-0.1.0/mkdocs/docs/exports.md +81 -0
- thunderdots-0.1.0/mkdocs/docs/fragmentation.md +109 -0
- thunderdots-0.1.0/mkdocs/docs/index.md +64 -0
- thunderdots-0.1.0/mkdocs/docs/installation.md +56 -0
- thunderdots-0.1.0/mkdocs/docs/metadata-validation.md +56 -0
- thunderdots-0.1.0/mkdocs/docs/notebooks/artifacts/thunderdots/resources_cache.csv +25 -0
- thunderdots-0.1.0/mkdocs/docs/notebooks/artifacts/thunderdots/results.json +1522 -0
- thunderdots-0.1.0/mkdocs/docs/notebooks/thunderdots_playground.ipynb +4019 -0
- thunderdots-0.1.0/mkdocs/docs/quickstart.md +67 -0
- thunderdots-0.1.0/mkdocs/docs/stylesheets/extra.css +32 -0
- thunderdots-0.1.0/mkdocs/docs/troubleshooting.md +25 -0
- thunderdots-0.1.0/mkdocs/mkdocs.yml +91 -0
- thunderdots-0.1.0/mkdocs/overrides/partials/copyright.html +10 -0
- thunderdots-0.1.0/pyproject.toml +64 -0
- thunderdots-0.1.0/pytest.ini +4 -0
- thunderdots-0.1.0/setup.cfg +4 -0
- thunderdots-0.1.0/tests/conftest.py +109 -0
- thunderdots-0.1.0/tests/fixtures/json/collection_encpos_2025.json +70 -0
- thunderdots-0.1.0/tests/fixtures/json/navigation_encpos_2025_01.json +22 -0
- thunderdots-0.1.0/tests/fixtures/json/resource_encpos_2025_01.json +28 -0
- thunderdots-0.1.0/tests/fixtures/xml/encpos_1893_05.xml +35 -0
- thunderdots-0.1.0/tests/fixtures/xml/smcp_pr_0004.xml +19 -0
- thunderdots-0.1.0/tests/test_client_offline.py +135 -0
- thunderdots-0.1.0/tests/test_exports_and_validation.py +118 -0
- thunderdots-0.1.0/tests/test_fixtures_integrity.py +33 -0
- thunderdots-0.1.0/tests/test_metadata_and_config.py +116 -0
- thunderdots-0.1.0/tests/test_online_dts.py +86 -0
- thunderdots-0.1.0/tests/test_tei_extraction.py +100 -0
- thunderdots-0.1.0/thunderdots/__init__.py +12 -0
- thunderdots-0.1.0/thunderdots/__version__.py +24 -0
- thunderdots-0.1.0/thunderdots/client.py +559 -0
- thunderdots-0.1.0/thunderdots/config.py +204 -0
- thunderdots-0.1.0/thunderdots/extract/resources.py +220 -0
- thunderdots-0.1.0/thunderdots/extract/tei.py +648 -0
- thunderdots-0.1.0/thunderdots/extract/walker.py +139 -0
- thunderdots-0.1.0/thunderdots/fetcher.py +260 -0
- thunderdots-0.1.0/thunderdots/normalize/__init__.py +3 -0
- thunderdots-0.1.0/thunderdots/normalize/dates.py +193 -0
- thunderdots-0.1.0/thunderdots/normalize/metadata.py +130 -0
- thunderdots-0.1.0/thunderdots/normalize/output.py +69 -0
- thunderdots-0.1.0/thunderdots/orm.py +383 -0
- thunderdots-0.1.0/thunderdots/stats.py +52 -0
- thunderdots-0.1.0/thunderdots/ui.py +161 -0
- thunderdots-0.1.0/thunderdots/validation/__init__.py +10 -0
- thunderdots-0.1.0/thunderdots/validation/models.py +61 -0
- thunderdots-0.1.0/thunderdots/validation/rules.py +63 -0
- thunderdots-0.1.0/thunderdots/validation/schemas.py +103 -0
- thunderdots-0.1.0/thunderdots/validation/validators.py +114 -0
- thunderdots-0.1.0/thunderdots.egg-info/PKG-INFO +194 -0
- thunderdots-0.1.0/thunderdots.egg-info/SOURCES.txt +70 -0
- thunderdots-0.1.0/thunderdots.egg-info/dependency_links.txt +1 -0
- thunderdots-0.1.0/thunderdots.egg-info/requires.txt +24 -0
- thunderdots-0.1.0/thunderdots.egg-info/top_level.txt +1 -0
- thunderdots-0.1.0/uv.lock +2911 -0
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
name: CI/CD
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- "**"
|
|
7
|
+
pull_request:
|
|
8
|
+
branches:
|
|
9
|
+
- "**"
|
|
10
|
+
workflow_dispatch:
|
|
11
|
+
|
|
12
|
+
permissions:
|
|
13
|
+
contents: read
|
|
14
|
+
|
|
15
|
+
concurrency:
|
|
16
|
+
group: ci-${{ github.ref }}
|
|
17
|
+
cancel-in-progress: true
|
|
18
|
+
|
|
19
|
+
jobs:
|
|
20
|
+
python-lint:
|
|
21
|
+
name: Python lint and format
|
|
22
|
+
runs-on: ubuntu-latest
|
|
23
|
+
|
|
24
|
+
steps:
|
|
25
|
+
- name: Checkout
|
|
26
|
+
uses: actions/checkout@v4
|
|
27
|
+
|
|
28
|
+
- name: Setup uv
|
|
29
|
+
uses: astral-sh/setup-uv@v5
|
|
30
|
+
with:
|
|
31
|
+
enable-cache: true
|
|
32
|
+
|
|
33
|
+
- name: Setup Python
|
|
34
|
+
run: uv python install 3.12
|
|
35
|
+
|
|
36
|
+
- name: Install dependencies
|
|
37
|
+
run: uv sync --all-extras --dev
|
|
38
|
+
|
|
39
|
+
- name: Ruff format check
|
|
40
|
+
run: uv run ruff format --check
|
|
41
|
+
|
|
42
|
+
- name: Ruff lint
|
|
43
|
+
run: uv run ruff check
|
|
44
|
+
|
|
45
|
+
python-tests:
|
|
46
|
+
name: Python unit tests
|
|
47
|
+
runs-on: ubuntu-latest
|
|
48
|
+
needs: python-lint
|
|
49
|
+
|
|
50
|
+
steps:
|
|
51
|
+
- name: Checkout
|
|
52
|
+
uses: actions/checkout@v4
|
|
53
|
+
|
|
54
|
+
- name: Setup uv
|
|
55
|
+
uses: astral-sh/setup-uv@v5
|
|
56
|
+
with:
|
|
57
|
+
enable-cache: true
|
|
58
|
+
|
|
59
|
+
- name: Setup Python
|
|
60
|
+
run: uv python install 3.12
|
|
61
|
+
|
|
62
|
+
- name: Install dependencies
|
|
63
|
+
run: uv sync --all-extras --dev
|
|
64
|
+
|
|
65
|
+
- name: Run unit tests
|
|
66
|
+
run: uv run pytest
|
|
67
|
+
|
|
68
|
+
package-build:
|
|
69
|
+
name: Build Python package
|
|
70
|
+
runs-on: ubuntu-latest
|
|
71
|
+
needs: python-tests
|
|
72
|
+
|
|
73
|
+
steps:
|
|
74
|
+
- name: Checkout
|
|
75
|
+
uses: actions/checkout@v4
|
|
76
|
+
with:
|
|
77
|
+
fetch-depth: 0
|
|
78
|
+
|
|
79
|
+
- name: Setup uv
|
|
80
|
+
uses: astral-sh/setup-uv@v5
|
|
81
|
+
with:
|
|
82
|
+
enable-cache: true
|
|
83
|
+
|
|
84
|
+
- name: Setup Python
|
|
85
|
+
run: uv python install 3.12
|
|
86
|
+
|
|
87
|
+
- name: Install dependencies
|
|
88
|
+
run: uv sync --all-extras --dev
|
|
89
|
+
|
|
90
|
+
- name: Build package
|
|
91
|
+
run: uv build
|
|
92
|
+
|
|
93
|
+
- name: Check package metadata
|
|
94
|
+
run: uvx twine check dist/*
|
|
95
|
+
|
|
96
|
+
docs-build:
|
|
97
|
+
name: Build documentation
|
|
98
|
+
runs-on: ubuntu-latest
|
|
99
|
+
needs: python-tests
|
|
100
|
+
|
|
101
|
+
steps:
|
|
102
|
+
- name: Checkout
|
|
103
|
+
uses: actions/checkout@v4
|
|
104
|
+
|
|
105
|
+
- name: Setup uv
|
|
106
|
+
uses: astral-sh/setup-uv@v5
|
|
107
|
+
with:
|
|
108
|
+
enable-cache: true
|
|
109
|
+
|
|
110
|
+
- name: Setup Python
|
|
111
|
+
run: uv python install 3.12
|
|
112
|
+
|
|
113
|
+
- name: Install dependencies
|
|
114
|
+
run: uv sync --all-extras --dev
|
|
115
|
+
|
|
116
|
+
- name: Build MkDocs documentation
|
|
117
|
+
run: uv run mkdocs build --strict -f mkdocs/mkdocs.yml
|
|
118
|
+
|
|
119
|
+
- name: Upload GitHub Pages artifact
|
|
120
|
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
|
121
|
+
uses: actions/upload-pages-artifact@v3
|
|
122
|
+
with:
|
|
123
|
+
path: mkdocs/site
|
|
124
|
+
|
|
125
|
+
docs-deploy:
|
|
126
|
+
name: Deploy documentation to GitHub Pages
|
|
127
|
+
runs-on: ubuntu-latest
|
|
128
|
+
needs: docs-build
|
|
129
|
+
|
|
130
|
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
|
131
|
+
|
|
132
|
+
permissions:
|
|
133
|
+
contents: read
|
|
134
|
+
pages: write
|
|
135
|
+
id-token: write
|
|
136
|
+
|
|
137
|
+
environment:
|
|
138
|
+
name: github-pages
|
|
139
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
140
|
+
|
|
141
|
+
steps:
|
|
142
|
+
- name: Deploy to GitHub Pages
|
|
143
|
+
id: deployment
|
|
144
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
name: Release to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types:
|
|
6
|
+
- published
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
id-token: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
pypi-release:
|
|
14
|
+
name: Build and publish to PyPI
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
|
|
17
|
+
environment:
|
|
18
|
+
name: pypi
|
|
19
|
+
url: https://pypi.org/project/thunderdots/
|
|
20
|
+
|
|
21
|
+
steps:
|
|
22
|
+
- name: Checkout
|
|
23
|
+
uses: actions/checkout@v4
|
|
24
|
+
with:
|
|
25
|
+
fetch-depth: 0
|
|
26
|
+
|
|
27
|
+
- name: Setup uv
|
|
28
|
+
uses: astral-sh/setup-uv@v5
|
|
29
|
+
with:
|
|
30
|
+
enable-cache: true
|
|
31
|
+
|
|
32
|
+
- name: Setup Python
|
|
33
|
+
run: uv python install 3.12
|
|
34
|
+
|
|
35
|
+
- name: Install dependencies
|
|
36
|
+
run: uv sync --all-extras --dev
|
|
37
|
+
|
|
38
|
+
- name: Check release tag format
|
|
39
|
+
run: |
|
|
40
|
+
uv run python - <<'PY'
|
|
41
|
+
import os
|
|
42
|
+
import re
|
|
43
|
+
|
|
44
|
+
tag = os.environ["GITHUB_REF_NAME"]
|
|
45
|
+
|
|
46
|
+
if not re.fullmatch(r"v\d+\.\d+\.\d+([a-zA-Z0-9.\-]+)?", tag):
|
|
47
|
+
raise SystemExit(
|
|
48
|
+
f"Invalid release tag '{tag}'. Expected something like v0.1.0"
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
print(f"Release tag format OK: {tag}")
|
|
52
|
+
PY
|
|
53
|
+
|
|
54
|
+
- name: Build package
|
|
55
|
+
run: uv build
|
|
56
|
+
|
|
57
|
+
- name: Check built package version
|
|
58
|
+
run: |
|
|
59
|
+
uv run python - <<'PY'
|
|
60
|
+
import os
|
|
61
|
+
import zipfile
|
|
62
|
+
from pathlib import Path
|
|
63
|
+
from email.parser import Parser
|
|
64
|
+
|
|
65
|
+
tag = os.environ["GITHUB_REF_NAME"]
|
|
66
|
+
expected = tag[1:] if tag.startswith("v") else tag
|
|
67
|
+
|
|
68
|
+
wheels = list(Path("dist").glob("*.whl"))
|
|
69
|
+
if not wheels:
|
|
70
|
+
raise SystemExit("No wheel found in dist/")
|
|
71
|
+
|
|
72
|
+
wheel = wheels[0]
|
|
73
|
+
with zipfile.ZipFile(wheel) as zf:
|
|
74
|
+
metadata_name = next(
|
|
75
|
+
name for name in zf.namelist()
|
|
76
|
+
if name.endswith(".dist-info/METADATA")
|
|
77
|
+
)
|
|
78
|
+
metadata = Parser().parsestr(
|
|
79
|
+
zf.read(metadata_name).decode("utf-8")
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
actual = metadata["Version"]
|
|
83
|
+
|
|
84
|
+
if actual != expected:
|
|
85
|
+
raise SystemExit(
|
|
86
|
+
f"Built package version '{actual}' does not match release tag '{tag}'"
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
print(f"Built package version OK: {actual}")
|
|
90
|
+
PY
|
|
91
|
+
|
|
92
|
+
- name: Check package metadata
|
|
93
|
+
run: uvx twine check dist/*
|
|
94
|
+
|
|
95
|
+
- name: Publish package to PyPI
|
|
96
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
|
|
3
|
+
.venv/
|
|
4
|
+
venv/
|
|
5
|
+
env/
|
|
6
|
+
ENV/
|
|
7
|
+
env.bak/
|
|
8
|
+
venv.bak/
|
|
9
|
+
__pycache__/
|
|
10
|
+
|
|
11
|
+
.cache/
|
|
12
|
+
|
|
13
|
+
# Jupyter Notebook
|
|
14
|
+
.ipynb_checkpoints/
|
|
15
|
+
|
|
16
|
+
# VS Code
|
|
17
|
+
.vscode/
|
|
18
|
+
|
|
19
|
+
# PyCharm
|
|
20
|
+
.idea/
|
|
21
|
+
|
|
22
|
+
# MacOS
|
|
23
|
+
.DS_Store
|
|
24
|
+
|
|
25
|
+
# scripts
|
|
26
|
+
project_dump.txt
|
|
27
|
+
dump_project.py
|
|
28
|
+
specs
|
|
29
|
+
out_results/
|
|
30
|
+
thunderdots/native/build/*
mkdocs/site/
|
|
31
|
+
site/
|
|
32
|
+
|
|
33
|
+
*.egg-info/
|
|
34
|
+
__version__.py
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
cff-version: 1.2.0
|
|
2
|
+
message: "If you use this software, please cite it using the metadata from this file."
|
|
3
|
+
title: "ThunderDots"
|
|
4
|
+
abstract: >
|
|
5
|
+
A Python client for Distributed Text Services (DTS) endpoints, initially built
|
|
6
|
+
for DoTS. ThunderDots walks DTS collections and subcollections, fetches
|
|
7
|
+
resources and TEI/XML documents, extracts text fragments from full documents,
|
|
8
|
+
DTS navigation, or custom TEI XPath rules, filters Dublin Core and extension
|
|
9
|
+
metadata, validates generated outputs, and exports records for indexing,
|
|
10
|
+
full-text search, RAG, vector databases, or corpus-analysis pipelines.
|
|
11
|
+
type: software
|
|
12
|
+
authors:
|
|
13
|
+
- family-names: Terriel
|
|
14
|
+
given-names: Lucas
|
|
15
|
+
orcid: "https://orcid.org/0000-0002-9189-258X"
|
|
16
|
+
affiliation: "École nationale des chartes – PSL"
|
|
17
|
+
repository-code: "https://github.com/chartes/thunderdots"
|
|
18
|
+
url: "https://github.com/chartes/thunderdots"
|
|
19
|
+
license: MIT
|
|
20
|
+
keywords:
|
|
21
|
+
- DTS
|
|
22
|
+
- Distributed Text Services
|
|
23
|
+
- TEI
|
|
24
|
+
- XML
|
|
25
|
+
- text extraction
|
|
26
|
+
- digital humanities
|
|
27
|
+
- cultural heritage
|
|
28
|
+
- corpus analysis
|
|
29
|
+
- full-text search
|
|
30
|
+
- RAG
|
|
31
|
+
- Elasticsearch
|
|
32
|
+
- Qdrant
|
|
33
|
+
- Python
|
|
34
|
+
version: "0.1.0"
|
|
35
|
+
date-released: "2026-01-01"
|
|
36
|
+
preferred-citation:
|
|
37
|
+
type: software
|
|
38
|
+
title: "ThunderDots"
|
|
39
|
+
authors:
|
|
40
|
+
- family-names: Terriel
|
|
41
|
+
given-names: Lucas
|
|
42
|
+
orcid: "https://orcid.org/0000-0002-9189-258X"
|
|
43
|
+
affiliation: "École nationale des chartes – PSL"
|
|
44
|
+
repository-code: "https://github.com/chartes/thunderdots"
|
|
45
|
+
url: "https://github.com/chartes/thunderdots"
|
|
46
|
+
license: MIT
|
|
47
|
+
date-released: "2026-01-01"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ThunderDots contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
.PHONY: help \
|
|
2
|
+
dev.sync dev.install \
|
|
3
|
+
docs.prepare docs.serve docs.build docs.build-dev docs.clean docs.check docs.deploy \
|
|
4
|
+
tests.unit tests.network tests.all \
|
|
5
|
+
lint.check lint.format \
|
|
6
|
+
check clean
|
|
7
|
+
|
|
8
|
+
# ---------------------------------------------------------------------
|
|
9
|
+
# Tooling
|
|
10
|
+
# ---------------------------------------------------------------------
|
|
11
|
+
|
|
12
|
+
UV ?= uv
|
|
13
|
+
PYTHON ?= $(UV) run python
|
|
14
|
+
PYTEST ?= $(PYTHON) -m pytest
|
|
15
|
+
RUFF ?= $(PYTHON) -m ruff
|
|
16
|
+
MKDOCS ?= $(PYTHON) -m mkdocs
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------
|
|
19
|
+
# Paths
|
|
20
|
+
# ---------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
DOCS_DIR ?= mkdocs
|
|
23
|
+
DOCS_CONFIG ?= $(DOCS_DIR)/mkdocs.yml
|
|
24
|
+
DOCS_SRC ?= $(DOCS_DIR)/docs
|
|
25
|
+
DOCS_SITE ?= $(DOCS_DIR)/site
|
|
26
|
+
|
|
27
|
+
ROOT_ASSETS_DIR ?= assets
|
|
28
|
+
DOCS_ASSETS_DIR ?= $(DOCS_SRC)/assets
|
|
29
|
+
|
|
30
|
+
NOTEBOOKS_DIR ?= notebooks
|
|
31
|
+
DOCS_NOTEBOOKS_DIR ?= $(DOCS_SRC)/notebooks
|
|
32
|
+
USER_NOTEBOOK ?= thunderdots_documentation_utilisateur.ipynb
|
|
33
|
+
|
|
34
|
+
# ---------------------------------------------------------------------
|
|
35
|
+
# Help
|
|
36
|
+
# ---------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
help:
|
|
39
|
+
@echo "ThunderDots development commands"
|
|
40
|
+
@echo ""
|
|
41
|
+
@echo "Environment:"
|
|
42
|
+
@echo " make dev.sync Sync project dependencies with uv"
|
|
43
|
+
@echo " make dev.install Install project with dev and docs extras"
|
|
44
|
+
@echo ""
|
|
45
|
+
@echo "Documentation:"
|
|
46
|
+
@echo " make docs.prepare Copy assets and notebooks into MkDocs docs_dir"
|
|
47
|
+
@echo " make docs.serve Serve MkDocs locally"
|
|
48
|
+
@echo " make docs.build Build documentation in strict mode"
|
|
49
|
+
@echo " make docs.build-dev Build documentation without strict mode"
|
|
50
|
+
@echo " make docs.check Validate documentation build"
|
|
51
|
+
@echo " make docs.deploy Deploy documentation to gh-pages"
|
|
52
|
+
@echo " make docs.clean Remove generated documentation site"
|
|
53
|
+
@echo ""
|
|
54
|
+
@echo "Tests:"
|
|
55
|
+
@echo " make tests.unit Run unit tests without network tests"
|
|
56
|
+
@echo " make tests.network Run tests with online DTS tests"
|
|
57
|
+
@echo " make tests.all Run all tests"
|
|
58
|
+
@echo ""
|
|
59
|
+
@echo "Linting:"
|
|
60
|
+
@echo " make lint.check Run Ruff checks"
|
|
61
|
+
@echo " make lint.format Format code with Ruff"
|
|
62
|
+
@echo ""
|
|
63
|
+
@echo "Global:"
|
|
64
|
+
@echo " make check Run lint + tests + docs check"
|
|
65
|
+
@echo " make clean Remove generated caches and build artefacts"
|
|
66
|
+
|
|
67
|
+
# ---------------------------------------------------------------------
|
|
68
|
+
# Environment
|
|
69
|
+
# ---------------------------------------------------------------------
|
|
70
|
+
|
|
71
|
+
dev.sync:
|
|
72
|
+
$(UV) sync --extra dev --extra docs
|
|
73
|
+
|
|
74
|
+
dev.install:
|
|
75
|
+
$(UV) pip install -e ".[dev,docs]"
|
|
76
|
+
|
|
77
|
+
# ---------------------------------------------------------------------
|
|
78
|
+
# Documentation
|
|
79
|
+
# ---------------------------------------------------------------------
|
|
80
|
+
|
|
81
|
+
docs.prepare:
|
|
82
|
+
mkdir -p $(DOCS_ASSETS_DIR)
|
|
83
|
+
mkdir -p $(DOCS_NOTEBOOKS_DIR)
|
|
84
|
+
@if [ -d "$(ROOT_ASSETS_DIR)" ]; then \
|
|
85
|
+
cp -R $(ROOT_ASSETS_DIR)/* $(DOCS_ASSETS_DIR)/ 2>/dev/null || true; \
|
|
86
|
+
fi
|
|
87
|
+
@if [ -f "$(NOTEBOOKS_DIR)/$(USER_NOTEBOOK)" ]; then \
|
|
88
|
+
cp "$(NOTEBOOKS_DIR)/$(USER_NOTEBOOK)" "$(DOCS_NOTEBOOKS_DIR)/$(USER_NOTEBOOK)"; \
|
|
89
|
+
fi
|
|
90
|
+
|
|
91
|
+
docs.serve: docs.prepare
|
|
92
|
+
$(MKDOCS) serve -f $(DOCS_CONFIG)
|
|
93
|
+
|
|
94
|
+
docs.build: docs.prepare
|
|
95
|
+
$(MKDOCS) build -f $(DOCS_CONFIG) --strict
|
|
96
|
+
|
|
97
|
+
docs.build-dev: docs.prepare
|
|
98
|
+
$(MKDOCS) build -f $(DOCS_CONFIG)
|
|
99
|
+
|
|
100
|
+
docs.clean:
|
|
101
|
+
rm -rf $(DOCS_SITE)
|
|
102
|
+
|
|
103
|
+
docs.check: docs.build
|
|
104
|
+
|
|
105
|
+
docs.deploy: docs.prepare
|
|
106
|
+
$(MKDOCS) gh-deploy -f $(DOCS_CONFIG) --force
|
|
107
|
+
|
|
108
|
+
# ---------------------------------------------------------------------
|
|
109
|
+
# Tests
|
|
110
|
+
# ---------------------------------------------------------------------
|
|
111
|
+
|
|
112
|
+
tests.unit:
|
|
113
|
+
$(PYTEST)
|
|
114
|
+
|
|
115
|
+
tests.network:
|
|
116
|
+
RUN_NETWORK_TESTS=1 $(PYTEST)
|
|
117
|
+
|
|
118
|
+
tests.all: tests.network
|
|
119
|
+
|
|
120
|
+
# ---------------------------------------------------------------------
|
|
121
|
+
# Linting / formatting
|
|
122
|
+
# ---------------------------------------------------------------------
|
|
123
|
+
|
|
124
|
+
lint.check:
|
|
125
|
+
$(RUFF) check thunderdots tests
|
|
126
|
+
|
|
127
|
+
lint.format:
|
|
128
|
+
$(RUFF) format thunderdots tests
|
|
129
|
+
$(RUFF) check thunderdots tests --fix
|
|
130
|
+
|
|
131
|
+
# ---------------------------------------------------------------------
|
|
132
|
+
# Global checks
|
|
133
|
+
# ---------------------------------------------------------------------
|
|
134
|
+
|
|
135
|
+
check: lint.check tests.unit docs.check
|
|
136
|
+
|
|
137
|
+
# ---------------------------------------------------------------------
|
|
138
|
+
# Cleaning
|
|
139
|
+
# ---------------------------------------------------------------------
|
|
140
|
+
|
|
141
|
+
clean: docs.clean
|
|
142
|
+
rm -rf .pytest_cache
|
|
143
|
+
rm -rf .ruff_cache
|
|
144
|
+
rm -rf htmlcov
|
|
145
|
+
rm -rf dist
|
|
146
|
+
rm -rf build
|
|
147
|
+
rm -rf *.egg-info
|
|
148
|
+
find . -type d -name "__pycache__" -prune -exec rm -rf {} +
|
|
149
|
+
find . -type f -name "*.pyc" -delete
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: thunderdots
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: ThunderDots is a Python client for DTS (Distributed Text Services) endpoints, initially built for DoTS.
|
|
5
|
+
Author: Lucas Terriel, École nationale des chartes - PSL
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/chartes/thunderdots
|
|
8
|
+
Project-URL: Repository, https://github.com/chartes/thunderdots
|
|
9
|
+
Project-URL: Documentation, https://chartes.github.io/thunderdots/
|
|
10
|
+
Project-URL: Issues, https://github.com/chartes/thunderdots/issues
|
|
11
|
+
Keywords: dts,tei,xml,digital-humanities,corpus,dots
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE.md
|
|
15
|
+
Requires-Dist: edtf>=5.0.1
|
|
16
|
+
Requires-Dist: h2>=4.3.0
|
|
17
|
+
Requires-Dist: httpx[http2]>=0.28.1
|
|
18
|
+
Requires-Dist: jsonschema>=4.26.0
|
|
19
|
+
Requires-Dist: lxml>=6.0.2
|
|
20
|
+
Requires-Dist: notebook>=7.5.6
|
|
21
|
+
Requires-Dist: rich>=14.3.2
|
|
22
|
+
Requires-Dist: tqdm>=4.67.3
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=9.0.2; extra == "dev"
|
|
25
|
+
Requires-Dist: ruff>=0.15.1; extra == "dev"
|
|
26
|
+
Requires-Dist: build; extra == "dev"
|
|
27
|
+
Requires-Dist: twine; extra == "dev"
|
|
28
|
+
Provides-Extra: docs
|
|
29
|
+
Requires-Dist: mkdocs>=1.6.1; extra == "docs"
|
|
30
|
+
Requires-Dist: mkdocs-material>=9.5; extra == "docs"
|
|
31
|
+
Requires-Dist: mkdocs-jupyter>=0.25; extra == "docs"
|
|
32
|
+
Requires-Dist: jupyter; extra == "docs"
|
|
33
|
+
Requires-Dist: ipykernel; extra == "docs"
|
|
34
|
+
Provides-Extra: all
|
|
35
|
+
Requires-Dist: thunderdots[dev,docs]; extra == "all"
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
|
|
38
|
+
<h1 align="center">
|
|
39
|
+
<img src="https://raw.githubusercontent.com/chartes/thunderdots/main/assets/dots-light.png" width="450"><br>
|
|
40
|
+
ThunderDots — DTS client for documentary corpora
|
|
41
|
+
</h1>
|
|
42
|
+
|
|
43
|
+
<p align="center">
|
|
44
|
+
<strong>Fast DTS crawling, TEI fragmentation, metadata filtering, validation, and export pipelines.</strong>
|
|
45
|
+
</p>
|
|
46
|
+
|
|
47
|
+
<p align="center">
|
|
48
|
+
<a href="https://github.com/astral-sh/uv">
|
|
49
|
+
<img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json" alt="uv">
|
|
50
|
+
</a>
|
|
51
|
+
<a href="https://github.com/astral-sh/ruff">
|
|
52
|
+
<img src="https://img.shields.io/badge/lint-ruff-0A0A0A?logo=ruff&logoColor=white" alt="ruff">
|
|
53
|
+
</a>
|
|
54
|
+
<a href="https://github.com/chartes/thunderdots/actions/workflows/ci.yml">
|
|
55
|
+
<img src="https://github.com/chartes/thunderdots/actions/workflows/ci.yml/badge.svg" alt="CI">
|
|
56
|
+
</a>
|
|
57
|
+
<a href="https://github.com/chartes/thunderdots/blob/main/LICENSE.md">
|
|
58
|
+
<img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License: MIT">
|
|
59
|
+
</a>
|
|
60
|
+
<a href="https://pypi.org/project/thunderdots/">
|
|
61
|
+
<img src="https://img.shields.io/pypi/v/thunderdots.svg" alt="PyPI version">
|
|
62
|
+
</a>
|
|
63
|
+
</p>
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Overview
|
|
68
|
+
|
|
69
|
+
**ThunderDots** is a Python client for [DTS](https://dtsapi.org/specifications/) (*Distributed Text Services*) endpoints, initially built for [DoTS](https://chartes.github.io/dots_documentation/).
|
|
70
|
+
|
|
71
|
+
It helps you move from a remote DTS API to structured Python objects and JSON records that can feed indexing pipelines, including full-text search, RAG/vector databases, and corpus-analysis workflows.
|
|
72
|
+
|
|
73
|
+
ThunderDots focuses on practical documentary workflows: crawling DTS collections, fetching TEI/XML resources, extracting reusable text fragments, selecting metadata, validating outputs, and exporting data to downstream search or indexing systems.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## What ThunderDots does
|
|
78
|
+
|
|
79
|
+
ThunderDots can:
|
|
80
|
+
|
|
81
|
+
- walk DTS collections and subcollections;
|
|
82
|
+
- fetch resources and TEI/XML documents;
|
|
83
|
+
- extract text fragments from full documents, DTS navigation, or custom TEI XPath rules;
|
|
84
|
+
- preserve or filter Dublin Core and extension metadata;
|
|
85
|
+
- enrich temporal metadata such as dates and coverage ranges;
|
|
86
|
+
- validate generated outputs with JSON Schema;
|
|
87
|
+
- export records to indexing pipelines such as Elasticsearch or Qdrant-compatible formats;
|
|
88
|
+
- cache fetched corpora as JSON and CSV;
|
|
89
|
+
- run synchronous or asynchronous workflows.
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Installation
|
|
94
|
+
|
|
95
|
+
### With `uv`
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
uv add thunderdots
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### With pip
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pip install thunderdots
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### For development
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
git clone https://github.com/chartes/thunderdots.git
|
|
111
|
+
cd thunderdots
|
|
112
|
+
|
|
113
|
+
uv venv
|
|
114
|
+
source .venv/bin/activate
|
|
115
|
+
uv sync --all-extras --dev
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Or, with pip:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
python -m venv .venv
|
|
122
|
+
source .venv/bin/activate
|
|
123
|
+
pip install -e ".[dev]"
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Minimal example
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
from thunderdots import ThunderDots
|
|
130
|
+
|
|
131
|
+
td = ThunderDots(
|
|
132
|
+
endpoint_dts="https://dots.chartes.psl.eu/api/dts",
|
|
133
|
+
collection_params={"collection_id": "ENCPOS_1900"},
|
|
134
|
+
resource_params={"fragment_mode": "document"},
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
td.fetch()
|
|
138
|
+
results = td.results()
|
|
139
|
+
|
|
140
|
+
print(td.stats())
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Development
|
|
144
|
+
|
|
145
|
+
### Run tests
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
pytest
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Online DTS tests are opt-in:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
RUN_NETWORK_TESTS=1 pytest
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Run Ruff (lint and format checks)
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
ruff format --check
|
|
161
|
+
ruff check
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### Build the documentation
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
mkdocs build --strict -f mkdocs/mkdocs.yml
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### Create a new PyPI release
|
|
171
|
+
|
|
172
|
+
Check the [release checklist](./RELEASE.md) for details.
|
|
173
|
+
|
|
174
|
+
### License
|
|
175
|
+
|
|
176
|
+
ThunderDots is distributed under the [MIT License](./LICENSE.md).
|
|
177
|
+
|
|
178
|
+
### Citation
|
|
179
|
+
|
|
180
|
+
If you use ThunderDots in academic work, please cite it as:
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
@software{terriel_thunderdots_2026,
|
|
184
|
+
author = {Terriel, Lucas},
|
|
185
|
+
title = {ThunderDots},
|
|
186
|
+
year = {2026},
|
|
187
|
+
publisher = {GitHub},
|
|
188
|
+
institution = {{École nationale des chartes}},
|
|
189
|
+
url = {https://github.com/chartes/thunderdots},
|
|
190
|
+
note = {Python client for Distributed Text Services (DTS) via DoTS}
|
|
191
|
+
}
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
You can also use the repository metadata from [CITATION.cff](./CITATION.cff).
|