tprdb-utilities 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tprdb_utilities-0.1.0/.gitignore +16 -0
- tprdb_utilities-0.1.0/LICENSE +21 -0
- tprdb_utilities-0.1.0/PKG-INFO +166 -0
- tprdb_utilities-0.1.0/README.md +152 -0
- tprdb_utilities-0.1.0/pyproject.toml +25 -0
- tprdb_utilities-0.1.0/src/tprdb_utilities/__init__.py +4 -0
- tprdb_utilities-0.1.0/src/tprdb_utilities/fetcher.py +194 -0
- tprdb_utilities-0.1.0/src/tprdb_utilities/reader.py +153 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Critt-Kent
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tprdb-utilities
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A toolkit for accessing and working with data from the CRITT Translation Process Research Database (TPR-DB).
|
|
5
|
+
Project-URL: Homepage, https://github.com/Critt-Kent/tprdb-utilities
|
|
6
|
+
Project-URL: Issues, https://github.com/Critt-Kent/tprdb-utilities/issues
|
|
7
|
+
Author-email: Devin Gilbert <DevinG@uvu.edu>
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Requires-Dist: pandas
|
|
12
|
+
Requires-Dist: requests
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# tprdb-utilities
|
|
16
|
+
|
|
17
|
+
[PyPI version](https://pypi.org/project/tprdb-utilities/)
|
|
18
|
+
[Supported Python versions](https://pypi.org/project/tprdb-utilities/)
|
|
19
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
20
|
+
|
|
21
|
+
A Python toolkit for downloading and reading data tables from the
|
|
22
|
+
[CRITT Translation Process Research Database (TPR-DB)](https://critt.as.kent.edu/tpr/).
|
|
23
|
+
|
|
24
|
+
Two functions cover the full workflow:
|
|
25
|
+
|
|
26
|
+
| Function | What it does |
|
|
27
|
+
|---|---|
|
|
28
|
+
| `fetch_TPRDB_tables` | Downloads study tables from the CRITT API and saves them to a local directory structure |
|
|
29
|
+
| `read_TPRDB_tables` | Reads those tables (locally or on the CRITT server) into a single `pandas.DataFrame` |
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
# pip
|
|
37
|
+
pip install tprdb-utilities
|
|
38
|
+
|
|
39
|
+
# uv
|
|
40
|
+
uv add tprdb-utilities
|
|
41
|
+
|
|
42
|
+
# poetry
|
|
43
|
+
poetry add tprdb-utilities
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Quick Start
|
|
49
|
+
|
|
50
|
+
### 1 — Download data (fetcher)
|
|
51
|
+
|
|
52
|
+
**Public study** (no credentials needed):
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from tprdb_utilities import fetch_TPRDB_tables
|
|
56
|
+
|
|
57
|
+
fetch_TPRDB_tables(
|
|
58
|
+
path="/path/to/local/data",
|
|
59
|
+
StudyID="DG21",
|
|
60
|
+
extension=["kd", "ss"],
|
|
61
|
+
public=True,
|
|
62
|
+
)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
**Private study** (requires your TPR-DB username and API token):
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from tprdb_utilities import fetch_TPRDB_tables
|
|
69
|
+
|
|
70
|
+
fetch_TPRDB_tables(
|
|
71
|
+
path="/path/to/local/data",
|
|
72
|
+
StudyID="MYSTUDY",
|
|
73
|
+
extension=["kd"],
|
|
74
|
+
public=False,
|
|
75
|
+
username="myTPRDBusername", # case-sensitive, must match your account
|
|
76
|
+
token="my-api-token",
|
|
77
|
+
)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
After downloading, the function always prints a summary like this:
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
=== fetch_TPRDB_tables Summary ===
|
|
84
|
+
StudyID : DG21
|
|
85
|
+
Clone dir: /path/to/local/data/tprdb-mothership-clone
|
|
86
|
+
User dir : TPRDB
|
|
87
|
+
|
|
88
|
+
Extension Status Time
|
|
89
|
+
--------- ---------- ------
|
|
90
|
+
kd Downloaded 1.23s
|
|
91
|
+
ss Downloaded 0.98s
|
|
92
|
+
|
|
93
|
+
To read these files with read_TPRDB_tables:
|
|
94
|
+
path = "/path/to/local/data/tprdb-mothership-clone"
|
|
95
|
+
user = "TPRDB"
|
|
96
|
+
studies = ["DG21"]
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Copy those argument values directly into `read_TPRDB_tables`.
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
### 2 — Read data (reader)
|
|
104
|
+
|
|
105
|
+
**From a local clone** (`mothership=False`) — after running `fetch_TPRDB_tables`:
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from tprdb_utilities import read_TPRDB_tables
|
|
109
|
+
|
|
110
|
+
df = read_TPRDB_tables(
|
|
111
|
+
studies=["DG21", "AR22"],
|
|
112
|
+
extension="kd",
|
|
113
|
+
mothership=False,
|
|
114
|
+
path="/path/to/local/data/tprdb-mothership-clone",
|
|
115
|
+
user="TPRDB",
|
|
116
|
+
)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
**Directly on the CRITT TPR-DB server** (`mothership=True`):
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from tprdb_utilities import read_TPRDB_tables
|
|
123
|
+
|
|
124
|
+
df = read_TPRDB_tables(
|
|
125
|
+
studies=["DG21", "AR22"],
|
|
126
|
+
extension="kd",
|
|
127
|
+
mothership=True, # path is set automatically; no path argument needed
|
|
128
|
+
)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## Directory Structure
|
|
134
|
+
|
|
135
|
+
`fetch_TPRDB_tables` creates the following layout under `path`:
|
|
136
|
+
|
|
137
|
+
```
|
|
138
|
+
<path>/
|
|
139
|
+
└── tprdb-mothership-clone/
|
|
140
|
+
├── TPRDB/ ← public studies
|
|
141
|
+
│ └── <StudyID>/
|
|
142
|
+
│ └── Tables/
|
|
143
|
+
│ ├── session1.kd
|
|
144
|
+
│ └── ...
|
|
145
|
+
└── <username>/ ← private studies
|
|
146
|
+
└── <StudyID>/
|
|
147
|
+
└── Tables/
|
|
148
|
+
├── session1.kd
|
|
149
|
+
└── ...
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
`read_TPRDB_tables` with `mothership=False` expects this exact layout, so the
|
|
153
|
+
two functions are designed to work together seamlessly.
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## Supported Table Extensions
|
|
158
|
+
|
|
159
|
+
`ss`, `sg`, `st`, `tt`, `kd`, `fd`, `au`, `pu`, `hof`, `pol`
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## License
|
|
164
|
+
|
|
165
|
+
MIT — see [LICENSE](LICENSE).
|
|
166
|
+
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# tprdb-utilities
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/tprdb-utilities/)
|
|
4
|
+
[Supported Python versions](https://pypi.org/project/tprdb-utilities/)
|
|
5
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
A Python toolkit for downloading and reading data tables from the
|
|
8
|
+
[CRITT Translation Process Research Database (TPR-DB)](https://critt.as.kent.edu/tpr/).
|
|
9
|
+
|
|
10
|
+
Two functions cover the full workflow:
|
|
11
|
+
|
|
12
|
+
| Function | What it does |
|
|
13
|
+
|---|---|
|
|
14
|
+
| `fetch_TPRDB_tables` | Downloads study tables from the CRITT API and saves them to a local directory structure |
|
|
15
|
+
| `read_TPRDB_tables` | Reads those tables (locally or on the CRITT server) into a single `pandas.DataFrame` |
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
# pip
|
|
23
|
+
pip install tprdb-utilities
|
|
24
|
+
|
|
25
|
+
# uv
|
|
26
|
+
uv add tprdb-utilities
|
|
27
|
+
|
|
28
|
+
# poetry
|
|
29
|
+
poetry add tprdb-utilities
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Quick Start
|
|
35
|
+
|
|
36
|
+
### 1 — Download data (fetcher)
|
|
37
|
+
|
|
38
|
+
**Public study** (no credentials needed):
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from tprdb_utilities import fetch_TPRDB_tables
|
|
42
|
+
|
|
43
|
+
fetch_TPRDB_tables(
|
|
44
|
+
path="/path/to/local/data",
|
|
45
|
+
StudyID="DG21",
|
|
46
|
+
extension=["kd", "ss"],
|
|
47
|
+
public=True,
|
|
48
|
+
)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**Private study** (requires your TPR-DB username and API token):
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from tprdb_utilities import fetch_TPRDB_tables
|
|
55
|
+
|
|
56
|
+
fetch_TPRDB_tables(
|
|
57
|
+
path="/path/to/local/data",
|
|
58
|
+
StudyID="MYSTUDY",
|
|
59
|
+
extension=["kd"],
|
|
60
|
+
public=False,
|
|
61
|
+
username="myTPRDBusername", # case-sensitive, must match your account
|
|
62
|
+
token="my-api-token",
|
|
63
|
+
)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
After downloading, the function always prints a summary like this:
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
=== fetch_TPRDB_tables Summary ===
|
|
70
|
+
StudyID : DG21
|
|
71
|
+
Clone dir: /path/to/local/data/tprdb-mothership-clone
|
|
72
|
+
User dir : TPRDB
|
|
73
|
+
|
|
74
|
+
Extension Status Time
|
|
75
|
+
--------- ---------- ------
|
|
76
|
+
kd Downloaded 1.23s
|
|
77
|
+
ss Downloaded 0.98s
|
|
78
|
+
|
|
79
|
+
To read these files with read_TPRDB_tables:
|
|
80
|
+
path = "/path/to/local/data/tprdb-mothership-clone"
|
|
81
|
+
user = "TPRDB"
|
|
82
|
+
studies = ["DG21"]
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Copy those argument values directly into `read_TPRDB_tables`.
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
### 2 — Read data (reader)
|
|
90
|
+
|
|
91
|
+
**From a local clone** (`mothership=False`) — after running `fetch_TPRDB_tables`:
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from tprdb_utilities import read_TPRDB_tables
|
|
95
|
+
|
|
96
|
+
df = read_TPRDB_tables(
|
|
97
|
+
studies=["DG21", "AR22"],
|
|
98
|
+
extension="kd",
|
|
99
|
+
mothership=False,
|
|
100
|
+
path="/path/to/local/data/tprdb-mothership-clone",
|
|
101
|
+
user="TPRDB",
|
|
102
|
+
)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
**Directly on the CRITT TPR-DB server** (`mothership=True`):
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from tprdb_utilities import read_TPRDB_tables
|
|
109
|
+
|
|
110
|
+
df = read_TPRDB_tables(
|
|
111
|
+
studies=["DG21", "AR22"],
|
|
112
|
+
extension="kd",
|
|
113
|
+
mothership=True, # path is set automatically; no path argument needed
|
|
114
|
+
)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Directory Structure
|
|
120
|
+
|
|
121
|
+
`fetch_TPRDB_tables` creates the following layout under `path`:
|
|
122
|
+
|
|
123
|
+
```
|
|
124
|
+
<path>/
|
|
125
|
+
└── tprdb-mothership-clone/
|
|
126
|
+
├── TPRDB/ ← public studies
|
|
127
|
+
│ └── <StudyID>/
|
|
128
|
+
│ └── Tables/
|
|
129
|
+
│ ├── session1.kd
|
|
130
|
+
│ └── ...
|
|
131
|
+
└── <username>/ ← private studies
|
|
132
|
+
└── <StudyID>/
|
|
133
|
+
└── Tables/
|
|
134
|
+
├── session1.kd
|
|
135
|
+
└── ...
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
`read_TPRDB_tables` with `mothership=False` expects this exact layout, so the
|
|
139
|
+
two functions are designed to work together seamlessly.
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Supported Table Extensions
|
|
144
|
+
|
|
145
|
+
`ss`, `sg`, `st`, `tt`, `kd`, `fd`, `au`, `pu`, `hof`, `pol`
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## License
|
|
150
|
+
|
|
151
|
+
MIT — see [LICENSE](LICENSE).
|
|
152
|
+
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tprdb-utilities"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A toolkit for accessing and working with data from the CRITT Translation Process Research Database (TPR-DB)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Devin Gilbert", email = "DevinG@uvu.edu" }
|
|
13
|
+
]
|
|
14
|
+
requires-python = ">=3.9"
|
|
15
|
+
dependencies = [
|
|
16
|
+
"pandas",
|
|
17
|
+
"requests",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.urls]
|
|
21
|
+
Homepage = "https://github.com/Critt-Kent/tprdb-utilities"
|
|
22
|
+
Issues = "https://github.com/Critt-Kent/tprdb-utilities/issues"
|
|
23
|
+
|
|
24
|
+
[tool.hatch.build.targets.wheel]
|
|
25
|
+
packages = ["src/tprdb_utilities"]
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import io
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
import zipfile
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def fetch_TPRDB_tables(
    path,
    StudyID,
    extension,
    public,
    username=None,
    token=None,
    verbose=0,
    timeout=120,
):
    """
    Download TPR-DB data tables from the CRITT TPR-DB API and save them
    locally in a directory structure that mirrors the TPR-DB server layout.

    Makes one HTTP GET request per extension to the CRITT TPR-DB REST API,
    receives a ``.zip`` archive, and extracts its contents directly into the
    appropriate ``Tables/`` subdirectory. The resulting file structure is
    identical to the layout expected by ``read_TPRDB_tables``, so the two
    functions are designed to be used in sequence.

    **File structure created**::

        <path>/
        └── tprdb-mothership-clone/
            ├── TPRDB/                  ← public studies
            │   └── <StudyID>/
            │       └── Tables/
            │           ├── session1.<ext>
            │           └── ...
            └── <username>/             ← private studies (when public=False)
                └── <StudyID>/
                    └── Tables/
                        ├── session1.<ext>
                        └── ...

    Parameters
    ----------
    path : str
        Root directory in which the ``tprdb-mothership-clone`` folder will
        be created (or appended to if it already exists). After downloading,
        pass ``os.path.join(path, "tprdb-mothership-clone")`` as the ``path``
        argument to ``read_TPRDB_tables`` — or simply copy the value from
        the summary printed by this function.
    StudyID : str
        Identifier of the study to download, e.g. ``"DG21"``. Must match a
        study registered in the TPR-DB exactly (case-sensitive).
    extension : str or list of str
        One or more table-type extensions to download, e.g.
        ``["kd", "ss", "st"]``. A single extension may also be passed as a
        plain string (``"kd"``). Valid values include ``"ss"``, ``"sg"``,
        ``"st"``, ``"tt"``, ``"kd"``, ``"fd"``, ``"au"``, ``"pu"``,
        ``"hof"``, and ``"pol"``. A leading dot is stripped. One API request
        is made per extension; extensions with files already present locally
        are skipped.
    public : bool
        Whether the requested study is publicly accessible.

        ``True``
            No credentials required. Files are saved under
            ``tprdb-mothership-clone/TPRDB/<StudyID>/Tables/``.

        ``False``
            Requires ``username`` and ``token``. Files are saved under
            ``tprdb-mothership-clone/<username>/<StudyID>/Tables/``.

    username : str, optional
        Your TPR-DB web application username. **Required when**
        ``public=False``. Must match your registered username exactly
        (case-sensitive). Also determines the folder name used for private
        study data, so it must be consistent across calls.
    token : str, optional
        Your TPR-DB API key (Bearer token). **Required when**
        ``public=False``. Obtain this from your TPR-DB account settings.
    verbose : int, optional
        Verbosity level. Default ``0``.

        ``1`` or higher
            Print the name of each file extracted from the zip archive for
            every downloaded extension.

    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` for each API request
        (connect/read timeout, not a limit on the total download time).
        Default ``120``. Pass ``None`` to wait indefinitely.

    Returns
    -------
    None
        This function saves files to disk and always prints a summary to
        stdout. It does not return data; use ``read_TPRDB_tables`` to load
        the downloaded files into a DataFrame.

    Raises
    ------
    ValueError
        If ``public=False`` and ``username`` or ``token`` is not provided.
    requests.HTTPError
        If the API returns a non-2xx HTTP status code. The error message
        includes the status code and response body for diagnosis, and the
        response object is attached to the exception.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    zipfile.BadZipFile
        If the response body is not a valid zip archive.

    Notes
    -----
    A summary is always printed after all extensions have been processed,
    regardless of the ``verbose`` setting. The summary includes ready-to-use
    argument values for ``read_TPRDB_tables`` so they can be copied directly
    into your next call.

    If files matching ``*.<ext>`` already exist in the ``Tables/``
    directory, the API request for that extension is skipped entirely.

    Examples
    --------
    **Downloading a public study:**

    >>> from tprdb_utilities import fetch_TPRDB_tables
    >>> fetch_TPRDB_tables(
    ...     path="/path/to/local/data",
    ...     StudyID="DG21",
    ...     extension=["kd", "ss"],
    ...     public=True,
    ... )

    **Downloading a private study:**

    >>> from tprdb_utilities import fetch_TPRDB_tables
    >>> fetch_TPRDB_tables(
    ...     path="/path/to/local/data",
    ...     StudyID="MYSTUDY",
    ...     extension=["kd"],
    ...     public=False,
    ...     username="myTPRDBusername",
    ...     token="my-api-token",
    ... )
    """
    if not public and (username is None or token is None):
        raise ValueError(
            "username and token are required when public=False. "
            "Provide your TPR-DB web app username (case-sensitive) and API token."
        )

    # A bare string would otherwise be iterated character by character
    # ("kd" -> "k", "d"), producing requests for extensions nobody asked for.
    if isinstance(extension, str):
        extension = [extension]

    folder_name = "TPRDB" if public else username
    clone_root = os.path.join(path, "tprdb-mothership-clone")
    target_dir = os.path.join(clone_root, folder_name, StudyID, "Tables")
    os.makedirs(target_dir, exist_ok=True)

    # Strip any leading dots so extensions are consistently bare (e.g. "kd" not ".kd")
    clean_extensions = [ext.lstrip(".") for ext in extension]

    # Loop-invariant request pieces. The API expects a lowercase boolean literal.
    api_url = "https://critt.as.kent.edu/tpr/api/tables/"
    public_flag = "true" if public else "false"
    headers = {} if public else {"Authorization": f"Bearer {token}"}

    results = []  # list of (ext, status_str, elapsed_str)

    for ext in clean_extensions:
        # Skip if files for this extension are already present. The directory
        # part is escaped so glob metacharacters in a user-supplied path
        # (e.g. "[" or "*") are matched literally.
        existing = glob.glob(os.path.join(glob.escape(target_dir), f"*.{ext}"))
        if existing:
            results.append((ext, "Skipped (already present)", "--"))
            continue

        t0 = time.perf_counter()
        # Passing the query via ``params`` lets requests URL-encode the values;
        # the explicit timeout keeps a stalled server from hanging the caller.
        response = requests.get(
            api_url,
            params={"studyID": StudyID, "extension": ext, "public": public_flag},
            headers=headers,
            timeout=timeout,
        )
        if not response.ok:
            raise requests.HTTPError(
                f"HTTP {response.status_code} for extension '{ext}': {response.text}",
                response=response,
            )

        with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
            members = zf.namelist()
            zf.extractall(target_dir)
        if verbose:
            for name in members:
                print(f" Extracted: {name}")

        elapsed = f"{time.perf_counter() - t0:.2f}s"
        results.append((ext, "Downloaded", elapsed))

    # --- Always-printed summary ---
    col_w = max(max((len(r[0]) for r in results), default=0), len("Extension"))
    status_w = max(max((len(r[1]) for r in results), default=0), len("Status"))

    print("=== fetch_TPRDB_tables Summary ===")
    print(f"StudyID : {StudyID}")
    print(f"Clone dir: {clone_root}")
    print(f"User dir : {folder_name}")
    print()
    print(f"{'Extension':<{col_w}} {'Status':<{status_w}} Time")
    print(f"{'-' * col_w} {'-' * status_w} ------")
    for ext, status, elapsed in results:
        print(f"{ext:<{col_w}} {status:<{status_w}} {elapsed}")
    print()
    print("To read these files with read_TPRDB_tables:")
    print(f' path = "{clone_root}"')
    print(f' user = "{folder_name}"')
    print(f' studies = ["{StudyID}"]')
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def read_TPRDB_tables(studies, extension, mothership, path=None, user="TPRDB", verbose=0):
    """
    Load TPR-DB data tables into a single concatenated DataFrame.

    Scans the expected TPR-DB directory layout for files matching the given
    extension across one or more studies and concatenates them into one
    ``pandas.DataFrame``.

    The directory layout expected (and created by ``fetch_TPRDB_tables``) is::

        <path>/
        └── <user>/
            └── <StudyID>/
                └── Tables/
                    ├── session1.<extension>
                    ├── session2.<extension>
                    └── ...

    Parameters
    ----------
    studies : str or list of str
        Study identifiers to load, e.g. ``["BML12", "SG12", "AR22"]``.
        Each must correspond to a subfolder under ``<path>/<user>/``.
        A single study may also be passed as a plain string.
    extension : str
        File extension identifying the table type, e.g. ``"kd"``, ``"ss"``,
        ``"sg"``, ``"st"``, ``"tt"``, ``"fd"``, ``"au"``, ``"pu"``,
        ``"hof"``, or ``"pol"``. A leading dot is not required.
    mothership : bool
        **Required.** Controls how the root path is resolved.

        ``True``
            You are running this function directly on the **CRITT TPR-DB
            server** (the "mothership"). The path is automatically set to
            ``/data/critt/tprdb/`` and the ``path`` argument is ignored.
            The ``user`` argument still applies (default ``"TPRDB"`` for
            the public corpus).

        ``False``
            You are working from a **local clone** of the TPR-DB structure,
            either assembled manually or downloaded via
            ``fetch_TPRDB_tables``. You *must* supply the ``path``
            argument pointing to the root of that clone (i.e. the
            ``tprdb-mothership-clone`` directory created by
            ``fetch_TPRDB_tables``).

    path : str, optional
        Root directory of the local TPR-DB clone. **Required when**
        ``mothership=False``; ignored when ``mothership=True``.
        This should be the ``tprdb-mothership-clone`` folder — i.e. the
        full path *including* the ``tprdb-mothership-clone`` segment — that
        was created by ``fetch_TPRDB_tables``.
    user : str, optional
        Name of the user sub-folder directly under ``path``. Default is
        ``"TPRDB"``, which corresponds to the public corpus. When working
        with private studies downloaded via ``fetch_TPRDB_tables``, set
        this to your TPR-DB username (the same value passed as ``username``
        to ``fetch_TPRDB_tables``).
    verbose : int, optional
        Verbosity level. Default ``0`` (silent).

        ``1``
            Print the study name and the number of table files found for
            each study.

        ``2`` or higher
            Also print the full path of each file as it is read.

    Returns
    -------
    pandas.DataFrame
        Concatenated DataFrame containing all rows from all matching table
        files across every requested study. Column names and dtypes are
        inferred automatically. Studies are read in the order given and,
        within a study, files are read in sorted path order, so the row
        order is reproducible. Returns an empty DataFrame if no matching
        files are found.

    Raises
    ------
    ValueError
        If ``mothership=False`` and ``path`` is not provided.

    Notes
    -----
    Files are expected to be tab-separated values (TSV). Each file
    corresponds to one recording session.

    The ``extension`` argument is matched as a file-name suffix, so passing
    ``"kd"`` will match any file whose name ends with ``"kd"``
    (e.g. ``"P01_DG21_EN-DE.kd"``).

    Examples
    --------
    **Use case 1 — Running on the CRITT TPR-DB server (mothership=True):**

    Users with direct access to the CRITT server do not need to specify a
    path; it is resolved automatically.

    >>> from tprdb_utilities import read_TPRDB_tables
    >>> df = read_TPRDB_tables(
    ...     studies=["DG21", "AR22"],
    ...     extension="kd",
    ...     mothership=True,
    ... )

    **Use case 2 — Reading from a local clone (mothership=False):**

    Data must have been previously downloaded with ``fetch_TPRDB_tables``
    (or arranged manually in the identical directory structure). Use the
    ``path`` and ``user`` values printed by ``fetch_TPRDB_tables`` at the
    end of its summary output.

    >>> from tprdb_utilities import read_TPRDB_tables
    >>> df = read_TPRDB_tables(
    ...     studies=["DG21"],
    ...     extension="kd",
    ...     mothership=False,
    ...     path="/path/to/tprdb-mothership-clone",
    ...     user="TPRDB",
    ... )
    """
    if mothership:
        path = "/data/critt/tprdb/"
    elif path is None:
        raise ValueError(
            "path is required when mothership=False. "
            "Provide the full path to your local TPR-DB clone root — "
            "this is the 'tprdb-mothership-clone' directory created by "
            "fetch_TPRDB_tables. Example: "
            "path='/your/local/data/tprdb-mothership-clone'"
        )

    # A bare string would otherwise be iterated character by character and
    # silently yield an empty result.
    if isinstance(studies, str):
        studies = [studies]

    # Collect the per-file frames and concatenate once at the end: repeated
    # pd.concat inside the loop copies all accumulated rows on every file.
    frames = []
    for study in studies:
        pattern = os.path.join(path, user, study, "Tables", f"*{extension}")
        # glob order is filesystem-dependent; sort for reproducible row order.
        files = sorted(glob.glob(pattern))
        if verbose:
            print(f"Reading: {study}\twith {len(files)} '{extension}' Tables")
        for fn in files:
            if verbose > 1:
                print(f"\t{fn}")
            frames.append(pd.read_csv(fn, sep="\t"))

    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if verbose:
        print(f"Total '{extension}' data rows: {df.shape[0]}, columns: {df.shape[1]}")
    return df
|