htruc 1.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- htruc/__init__.py +0 -0
- htruc/catalog.py +301 -0
- htruc/cli.py +210 -0
- htruc/repos/__init__.py +2 -0
- htruc/repos/_generic.py +21 -0
- htruc/repos/_github.py +86 -0
- htruc/schemas/__init__.py +34 -0
- htruc/schemas/upgrade_path.py +41 -0
- htruc/types.py +8 -0
- htruc/utils.py +78 -0
- htruc/validator.py +82 -0
- htruc-1.1.4.dist-info/METADATA +82 -0
- htruc-1.1.4.dist-info/RECORD +17 -0
- htruc-1.1.4.dist-info/WHEEL +5 -0
- htruc-1.1.4.dist-info/entry_points.txt +2 -0
- htruc-1.1.4.dist-info/licenses/LICENSE.md +373 -0
- htruc-1.1.4.dist-info/top_level.txt +1 -0
htruc/__init__.py
ADDED
|
File without changes
|
htruc/catalog.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
from typing import Optional, Dict, Any, List, Tuple, Iterable, Union
|
|
2
|
+
import os
|
|
3
|
+
import logging
|
|
4
|
+
import pandas
|
|
5
|
+
import cffconvert
|
|
6
|
+
import requests
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
from htruc.repos import get_github_repo_yaml, get_htr_united_repos, get_github_repo_cff
|
|
10
|
+
from htruc.utils import parse_yaml
|
|
11
|
+
from htruc import validator
|
|
12
|
+
from htruc.schemas import recursive_update
|
|
13
|
+
from htruc.types import CatalogRecord, Catalog
|
|
14
|
+
logger = logging.getLogger()
|
|
15
|
+
logger.setLevel(logging.INFO)
|
|
16
|
+
|
|
17
|
+
_ZenodoRecord: re.Pattern = re.compile(r"zenodo\.org/record/([0-9]+)")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _clean_a_dict(catalog: Catalog) -> Catalog:
    """ Remove, in place, the records of a catalog that do not pass validation.

    Each record is submitted on its own to `validator.run` with `schema_path="auto"`.

    :param catalog: Mapping of a record key to its catalog record, modified in place
    :returns: The very same mapping, once the invalid records have been deleted
    """
    rejected: List[str] = []

    # Deletions are postponed: a dictionary cannot change size while it is iterated over
    for key, record in catalog.items():
        for report in validator.run([record], schema_path="auto"):
            if report.status:
                continue
            rejected.append(key)
            logging.warning(f"Invalid schema file for {key}")

    for key in rejected:
        del catalog[key]

    return catalog
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _upgrade_a_dict(catalog: Catalog) -> Catalog:
    """ Replace, in place, each record of a catalog by its upgraded version.

    :param catalog: Mapping of a record key to its catalog record, modified in place
    :returns: The very same mapping
    """
    for key in catalog:
        upgraded, applied_upgrades = recursive_update(catalog[key])
        if not applied_upgrades:
            # Nothing was applied, the stored record is kept untouched
            continue
        catalog[key] = upgraded

    return catalog
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_local_yaml(directory: str, keep_valid_only: bool = True) -> Catalog:
    """ Walk a directory and load every `.yml` / `.yaml` file found as a Catalog Record.

    Records are indexed by their `url` property. A file that cannot be loaded is reported through
    the logs and skipped.

    :param directory: Directory to scan, sub-directories included
    :param keep_valid_only: Drop the records that do not pass the HTR-United validation
    :returns: Mapping of the `url` of each record to the record itself
    """
    records = {}
    for folder, _subfolders, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.endswith((".yml", ".yaml")):
                continue
            try:
                record = parse_yaml(os.path.join(folder, filename))
                records[record.get("url")] = record
            except Exception as error:
                logging.warning(f"Impossible to parse and understand {filename}")
                logger.info(str(error))

    if keep_valid_only:
        _clean_a_dict(records)
    return records
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def clever_catalog_update(catalog1: Dict, catalog2: Dict) -> Dict:
    """ Merge `catalog2` into `catalog1`, in place.

    A record that only exists in `catalog2` is added as it is. For a record known to both catalogs,
    only the `characters` and `volume` properties are taken from `catalog2`; everything else keeps
    the value it has in `catalog1`.

    :param catalog1: Catalog receiving the update, modified in place
    :param catalog2: Catalog providing new records and fresher metrics
    :returns: `catalog1`
    """
    refreshed_properties = ("characters", "volume")

    for repository, incoming in catalog2.items():
        if repository not in catalog1:
            catalog1[repository] = incoming
            continue
        known = catalog1[repository]
        for prop in refreshed_properties:
            if prop in incoming:
                known[prop] = incoming[prop]

    return catalog1
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_all_catalogs(
        access_token: Optional[str] = None,
        local_directory: Optional[str] = None,
        get_distant: bool = True,
        organizations: Optional[Union[str, Iterable[str]]] = "htr-united",
        check_link: bool = False,
        ignore_orgs_gits: List[str] = None,
        keep_valid_only: bool = True,
        auto_upgrade: bool = False,
        citation_cff: bool = False
) -> Catalog:
    """ Retrieve repositories from various location (online, locally) and create a catalog out of the records.

    :param access_token: Github Access Token to retrieve information ~ without limit from Github.com
    :param local_directory: Local directory to scan for files
    :param get_distant: Retrieves data from organisations on Github (Scan all their repositories)
    :param organizations: Organization, or organizations, to scan. `None` scans none.
    :param check_link: If a local directory catalog record links to a github repository, scan the remote repository
        for any updates on the catalog
    :param ignore_orgs_gits: Ignore specific repositories in the scan. When left to `None`, the default exclusion
        list of `get_htr_united_repos` applies.
    :param keep_valid_only: Only Keeps valid catalog record
    :param auto_upgrade: Upgrade automatically all schemas to the latest version (Only applied if keep_valid_only is
        True)
    :param citation_cff: Look for citation data (Bibtex and/or APA) for each record and store what is found in the
        record
    :returns: A new dictionary of the records, sorted by key
    """
    data: Catalog = {}
    if local_directory:
        data.update(get_local_yaml(directory=local_directory, keep_valid_only=False))
        if check_link:
            for uri in data:
                # We update the catalog if needs be by checking each repo.
                # A record without `url` is indexed under None by get_local_yaml, hence the type check.
                if isinstance(uri, str) and "github.com" in uri:
                    print(f"Fetching {uri} remotely to update metrics")
                    results = get_github_repo_yaml(address=uri, access_token=access_token)
                    if results:
                        data[uri] = results

    if get_distant:
        if organizations is None:
            organizations = ()
        elif isinstance(organizations, str):
            organizations = (organizations, )

        # Forwarding `exclude=None` would override the default of get_htr_united_repos and break its
        # `repo.name in exclude` test: the argument is only passed when a list was actually given.
        scan_options = {}
        if ignore_orgs_gits is not None:
            scan_options["exclude"] = ignore_orgs_gits

        for orga in organizations:
            data = clever_catalog_update(
                data,
                get_htr_united_repos(
                    access_token=access_token,
                    main_organization=orga,
                    **scan_options
                )
            )

    if keep_valid_only:
        _clean_a_dict(data)
    if auto_upgrade and keep_valid_only:
        _upgrade_a_dict(data)

    if citation_cff:
        for key in data:
            up = _get_bibtex_and_apa(data[key], access_token=access_token)
            if up:
                logger.info(f"Successfully retrieved Bibtex or/and APA for {key}")
                data[key].update(up)

    return dict(sorted(data.items()))
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def get_statistics(repositories: Catalog) -> pandas.DataFrame:
    """ Flatten the volumes of a catalog into a table holding one row per record and metric.

    A record that lacks one of the properties read here, or whose properties do not have the
    expected structure, is reported with a warning; the rows produced for it before the failure
    are kept.

    :param repositories: Dictionary of Repositories records
    :returns: Table with the columns uri, title, start, end, metric, count, format and script-type

    >>> #x = get_all_catalogs(local_directory="/home/thibault/dev/htr-united", check_link=False, get_distant=False)
    >>> #get_statistics(x).groupby(by="metric").sum()
    """
    rows = []
    for repository, entry in repositories.items():
        try:
            begin, end = entry["time"]["notBefore"], entry["time"]["notAfter"]
            for a_volume in entry.get("volume", []):
                row = {
                    "uri": repository,
                    "title": entry["title"],
                    "start": int(begin),
                    "end": int(end),
                    "metric": a_volume["metric"].lower(),
                    "count": int(a_volume["count"]),
                    "format": entry["format"],
                    "script-type": entry["script-type"]
                }
                rows.append(row)
        except (KeyError, TypeError):
            logger.warning(f"Unable to parse {repository} for statistics")
    return pandas.DataFrame(rows)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def group_per_year(df: pandas.DataFrame, column: Optional[str] = "metric", period: int = 50):
    """ Sum the `count` of each value of `column` over consecutive windows of `period` years.

    A row contributes its whole `count` to every window that its [start, end] interval overlaps.
    Windows are aligned on multiples of `period` and span from the window of the smallest `start`
    to the window of the largest `end`, both included.

    :param df: Table with the columns `start`, `end`, `count` and `column`
    :param column: Column whose values become the columns of the output
    :param period: Size of a window, in years
    :returns: Table with a `year` column (first year of the window) and one integer column per
        value of `column`; a table without any row when `df` is empty

    >>> group_per_year(pandas.DataFrame([
    ...    {"start": 1300, "end": 1399, "metric": "line", "count": 134},
    ...    {"start": 1300, "end": 1399, "metric": "characters", "count": 234},
    ...    {"start": 1350, "end": 1449, "metric": "line", "count": 34},
    ...    {"start": 1500, "end": 1551, "metric": "line", "count": 37},
    ...    {"start": 1503, "end": 1504, "metric": "line", "count": 37},
    ...    {"start": 1300, "end": 1499, "metric": "line", "count": 2}]))
       year  characters  line
    0  1300         234   136
    1  1350         234   170
    2  1400           0    36
    3  1450           0     2
    4  1500           0    74
    5  1550           0    37

    """
    if df.empty:
        # No row means no time span to cut into windows
        return pandas.DataFrame({"year": []}, dtype=int)

    first_window = int(df.start.min()) // period * period
    # Exclusive bound set right after the window that holds the latest `end`: an `end` that falls
    # exactly on a multiple of `period` opens a window of its own and must not be left out.
    stop = (int(df.end.max()) // period + 1) * period

    rows = []
    for window_start in range(first_window, stop, period):
        window_end = window_start + period - 1
        overlapping = df[
            df.start.between(window_start, window_end)
            | df.end.between(window_start, window_end)
            | ((df.start < window_start) & (window_end < df.end))
        ]
        rows.append({
            "year": window_start,
            **overlapping.groupby(column)["count"].sum()
        })

    # A value of `column` absent from a window shows up as NaN: it counts for 0
    return pandas.DataFrame(rows).fillna(0).astype(int)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
MetricLists = List[Dict[str, int]]


def update_volume(original_volume: MetricLists, metrics: MetricLists) -> Tuple[MetricLists, MetricLists]:
    """ Merge freshly computed metrics into the volume of a record.

    :param original_volume: Metrics currently stored, as `{"metric": ..., "count": ...}` items
    :param metrics: Newly computed metrics, in the same format; they win over the stored ones
    :returns: The merged metrics sorted by metric name, then the difference (new count minus old
        count) of each metric found in both inputs, sorted the same way

    >>> old = [{"metric": "pages", "count": 5}, {"metric": "documents", "count": 5}]
    >>> new = [{"metric": "pages", "count": 10}, {"metric": "line", "count": 105}]
    >>> out = update_volume(old, new)
    >>> out == (
    ...    [{"metric": "documents", "count": 5}, {"metric": "line", "count": 105}, {"metric": "pages", "count": 10}],
    ...    [{"metric": "pages", "count": 5}],
    ... )
    True

    """
    before = {}
    for item in original_volume:
        before[item["metric"]] = item["count"]
    after = {}
    for item in metrics:
        after[item["metric"]] = item["count"]

    merged: MetricLists = []
    changes: MetricLists = []
    for name in sorted(set(before) | set(after)):
        if name not in after:
            # Metric that was not recomputed: the stored count is carried over
            merged.append({"metric": name, "count": before[name]})
            continue
        if name in before:
            changes.append({"metric": name, "count": after[name] - before[name]})
        merged.append({"metric": name, "count": after[name]})

    return merged, changes
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _get_bibtex_and_apa(catalog_record: CatalogRecord, access_token: Optional[str]=None) -> Dict[str, str]:
    """ Look for citation data about a record, first on GitHub, then on Zenodo, then through its DOI.

    :param catalog_record: Record to document; its `url` property is read
    :param access_token: Github Access Token, forwarded to the GitHub lookup
    :returns: Whatever `_get_github_citation_file` produced when it is not empty; otherwise
        `{"_bibtex": ...}` when Zenodo or the DOI address answers with Bibtex; otherwise an empty
        dictionary
    """
    # Without a timeout, `requests` waits indefinitely on a server that stops answering
    timeout = 30
    bibtex_headers = {"Accept": "application/x-bibtex"}

    through_github = _get_github_citation_file(catalog_record, access_token)
    if through_github:
        return through_github

    url = catalog_record["url"]

    zenodo_match = _ZenodoRecord.search(url)
    if zenodo_match:
        record = zenodo_match.group(1)
        try:
            req = requests.get(f"https://zenodo.org/api/records/{record}", headers=bibtex_headers, timeout=timeout)
            req.raise_for_status()
            return {"_bibtex": req.text}
        except Exception as E:
            # Best effort: the failure is logged and the next source is tried
            logger.error(f"Unable to reach Zenodo for Bibtex ({url}): {E}")

    if "doi.org" in url:
        try:
            req = requests.get(url, headers=bibtex_headers, timeout=timeout)
            req.raise_for_status()
            return {"_bibtex": req.text}
        except Exception as E:
            logger.error(f"Unable to reach the DOI API for Bibtex ({url}): {E}")

    return {}
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _get_github_citation_file(catalog_record: CatalogRecord, access_token: Optional[str] = None) -> Dict[str, str]:
    """ Retrieve the CFF citation file of a record and convert it to Bibtex and APA.

    The file is downloaded from the `citation-file-link` property when the record has one. If that
    download fails and the record `url` points to github.com, or if there is no such property but
    the `url` points to github.com, the file is looked for in the GitHub repository itself.

    :param catalog_record: Record to document; `url` and, optionally, `citation-file-link` are read
    :param access_token: Github Access Token used when the repository is queried
    :returns: Dictionary holding `_bibtex` and/or `_apa`, depending on which conversions succeeded;
        empty when no citation file could be retrieved or parsed
    """
    # Without a timeout, `requests` waits indefinitely on a server that stops answering
    timeout = 30
    has_link = "citation-file-link" in catalog_record

    if not has_link and "github.com" not in catalog_record["url"]:
        return {}
    elif not has_link:
        citation_file_content = get_github_repo_cff(catalog_record["url"], access_token=access_token)
        if not citation_file_content:
            return {}
    else:  # We got a URI
        try:
            req = requests.get(catalog_record["citation-file-link"], timeout=timeout)
            req.raise_for_status()
            citation_file_content = req.text
            if "</html>" in citation_file_content.lower():
                raise Exception("CFF File link is wrong, it returns HTML.")
            elif not citation_file_content.strip():
                # Reported explicitly instead of letting an index lookup fail on an empty body
                raise Exception("CFF File link returns an empty document")
            elif citation_file_content.lstrip().startswith("{"):
                raise Exception("Got JSON at the given endpoint instead of YAML")
        except Exception as E:
            logger.error(f"Error retrieving CITATION File for {catalog_record['citation-file-link']}: {str(E)}")
            if "github.com" in catalog_record["url"]:
                logger.error("Trying to reach github directly")
                # Same lookup, on a record stripped of its faulty link
                return _get_github_citation_file({"url": catalog_record["url"]}, access_token=access_token)
            return {}

    try:
        citation = cffconvert.Citation(citation_file_content)
    except Exception as E:
        logger.error(f"Unable to parse CFF for {catalog_record['url']} ({E})")
        nl = "\n"
        logger.error(f"Content: \n>>> {citation_file_content.replace(nl, nl+'>>> ')}")
        return {}

    # The two exports are independent: one failing does not prevent the other from being returned
    return_obj = {}
    try:
        return_obj["_bibtex"] = citation.as_bibtex()
    except Exception as E:
        logger.error(f"Unable to parse as Bibtex {catalog_record['url']} ({E})")

    try:
        return_obj["_apa"] = citation.as_apalike()
    except Exception as E:
        logger.error(f"Unable to parse as APA {catalog_record['url']} ({E})")

    return return_obj
|
htruc/cli.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import click
|
|
3
|
+
import os.path
|
|
4
|
+
from ruamel.yaml import YAML
|
|
5
|
+
import json
|
|
6
|
+
from typing import Optional, List
|
|
7
|
+
|
|
8
|
+
from htruc.validator import run
|
|
9
|
+
from htruc.catalog import get_all_catalogs, get_statistics, group_per_year, update_volume, _get_bibtex_and_apa
|
|
10
|
+
from htruc.utils import parse_yaml, create_json_catalog, get_local_or_download, dump_yaml
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _error(message):
    """ Print a message in red, colours being forced on.

    :param message: Text to display
    """
    styled = click.style(message, fg="red")
    click.echo(styled, color=True)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@click.group()
def cli():
    """ Interface for HTRUC """
    # Nothing runs at the group level: this function only anchors the sub-commands that are
    # registered with `@cli.command(...)`. The docstring is kept short as it is used by click
    # as help text.
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@cli.command("test")
@click.argument("files", type=click.File(), nargs=-1)
@click.option(
    "--version", type=str, default="auto", show_default=True,
    help="Date of the schema version"
)
@click.option("--force-download", is_flag=True, help="Download the schema using the version provided")
def test(files, version: str, force_download: bool):
    """ Test catalog files """
    # The process exits with 0 when every file passes (or when there was nothing to test) and
    # with -1 as soon as one file fails.
    click.echo(f"{len(files)} to be tested")
    if version != "auto":
        version = get_local_or_download(version, force_download=force_download)

    statuses = []
    for status in run(files, schema_path=version):
        statuses.append(status.status)
        if status.status is False:
            _error(f"☒ File `{status.filename}` testing failed")
            for message in status.messages:
                _error(f" {message}")
            click.echo()

    if not statuses:
        # FILES accepts zero values: without this guard the ratio below divides by zero
        click.echo(click.style("No file was tested", fg="yellow"))
        sys.exit(0)

    passed = statuses.count(True)
    click.echo(
        click.style(
            f"{passed/len(statuses)*100:.2f}% of schema passed ({passed}/{len(statuses)})",
            fg="red" if False in statuses else "green"
        )
    )
    sys.exit(-1 if False in statuses else 0)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@cli.command("make")
@click.argument("directory", default="./catalog/")
@click.option("-o", "--organization", default=("htr-united", ), show_default=True, multiple=True,
              help="Organization to retrieve repositories from")
@click.option("--remote/--no-remote", is_flag=True, default=True, show_default=True,
              help="Retrieve data from remote repositories in the organization's account")
@click.option("--clean/--dirty", is_flag=True, default=True, show_default=True,
              help="Keep only the valid catalog records")
@click.option("--auto-upgrade/--no-auto-upgrade", is_flag=True, default=True, show_default=True,
              help="Automatically upgrade to the latest schema")
@click.option("--citation/--no-citation", is_flag=True, default=True, show_default=True,
              help="Retrieve CITATION.CFF from repositories and creates unstandardized _apa and _bibtex properties "
                   "for each record")
@click.option("--check-link", is_flag=True, default=False, show_default=True,
              help="For each github repository documented in the local files, tries to download a `htr-united.yaml`"
                   " file from it.")
@click.option("--output", default="catalog.yaml", show_default=True,
              help="Dumps the agglutinated catalog as YAML")
@click.option("--json", default=None, show_default=True,
              help="Dumps the whole catalog as JSON too")
@click.option("--graph", default=None, show_default=True,
              help="Produce a graph at the path given (PNG Files please) with the amount of metrics "
                   "at different times")
@click.option("--graph-csv", default=None, show_default=True,
              help="Outputs the data behind the graph into a CSV file")
@click.option("--access_token", default=None, show_default=True,
              help="Github Access token")
@click.option("--statistics", default=None, show_default=True,
              help="Produce a recap CSV file with different statistics about the period covered by the dataset")
@click.option("--ignore-repo", default=["htr-united", "template-htr-united-datarepo", "template-depot"], multiple=True, show_default=True,
              help="Repos of the main organization that can be ignored")
@click.option("--ids", default="ids.json", type=click.Path(dir_okay=False), show_default=True,
              help="JSON file with IDs that maps each repository URLs")
def make(directory, organization: str, access_token: Optional[str] = None, remote: bool = True,
         check_link: bool = False, output: str = "catalog.yaml",
         json: Optional[str] = None,
         graph: Optional[str] = None,
         statistics: Optional[str] = None,
         graph_csv: Optional[str] = None,
         ignore_repo: List[str] = None,
         ids: Optional[str] = None,
         auto_upgrade: bool = True,
         clean: bool = True,
         citation: bool = True):
    """ Generate a catalog from a main repository and an organization

    """
    # Parameter names mirror the click options above and cannot be renamed. NB: `organization` is
    # declared with `multiple=True`, and `ids` with `click.Path`, so the former holds several
    # values and the latter a path rather than an opened file.
    catalog = get_all_catalogs(
        access_token=access_token,
        organizations=organization,
        local_directory=directory,
        get_distant=remote,
        check_link=check_link,
        ignore_orgs_gits=ignore_repo,
        keep_valid_only=clean,
        auto_upgrade=auto_upgrade,
        citation_cff=citation
    )
    click.echo(f"Dumping YAML output into {output}")
    with open(output, "w") as f:
        dump_yaml(list(catalog.values()), f, sort_keys=False)

    if json:
        click.echo(f"Dumping JSON output into {json}")
        # The `json` parameter hides the module of the same name within this function
        from json import dump
        with open(json, "w") as f:
            dump(create_json_catalog(catalog, ids_files=ids), f)

    if not (graph or statistics or graph_csv):
        return

    stats = get_statistics(catalog)
    if statistics:
        click.echo(f"Writing stats to {statistics}")
        stats.to_csv(statistics)

    if not (graph or graph_csv):
        return

    data = group_per_year(stats)
    if graph_csv:
        click.echo(f"Plotting stats to {graph_csv}")
        data.to_csv(graph_csv)

    if graph:
        click.echo(f"Plotting {len(data.columns)-1} files with {graph} basename")
        cols = [col for col in data.columns if col != "year"]
        if not cols:
            # A grid without any row cannot be created
            _error("No metric to plot, the graph is skipped")
            return

        import matplotlib.pyplot as plot

        # Two plots per row, the last row being half empty for an odd number of metrics
        nrows = len(cols) // 2 + len(cols) % 2
        fig, axes = plot.subplots(
            nrows=nrows,
            ncols=2,
            sharex=True,
            # Always get a 2D grid: with squeeze=True a single row comes back as a 1D array,
            # which cannot be walked row by row
            squeeze=False,
            figsize=(10, 5 * nrows),
            dpi=300
        )
        for metric, ax in zip(cols, axes.flat):
            data.plot.line(x="year", y=metric, ax=ax)
        fig.savefig(graph)
        plot.close(fig)
        click.echo(f"Saved {graph}")
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@cli.command("update-volumes")
@click.argument("catalog-file", type=click.File(), nargs=1)
@click.argument("metrics-json", type=click.File(), nargs=1)
@click.option(
    "--inplace", type=bool, is_flag=True, default=False, show_default=True,
    help="Saves the modified catalog inside the original file"
)
def catalog_volume_update(catalog_file, metrics_json, inplace):
    """ Update the metrics of a file """
    # CATALOG-FILE is the record to update, METRICS-JSON a JSON document with a `volume` list and,
    # optionally, a `characters` mapping. Unless --inplace is set, the result is written next to
    # the original file, with `.auto-update` inserted before the extension.
    record = parse_yaml(catalog_file)
    parsed_metrics = json.load(metrics_json)
    metrics_volume = parsed_metrics["volume"]
    updated, difference = update_volume(record.get("volume", []), metrics_volume)
    record["volume"] = updated
    for metric in difference:
        if metric["count"] < 0:
            click.echo(click.style(f"> The category `{metric['metric']}` decreased by {abs(metric['count'])}",
                                   fg="yellow"))
        else:
            click.echo(click.style(f"> The category `{metric['metric']}` increased by {metric['count']}", fg="green"))

    # Close the original file
    catalog_file.close()

    # `schema` holds the address of the schema rather than its bare date: the date is looked for
    # inside the value, which covers both forms.
    # NOTE(review): presumably `characters` is skipped there because the 2021-10-15 schema does
    # not define it - confirm.
    if "2021-10-15" not in record["schema"] and "characters" in parsed_metrics:
        if "characters" not in record:
            record["characters"] = {}
        record["characters"].update(parsed_metrics["characters"])

    filename = f"{catalog_file.name}"
    if not inplace:
        # splitext only looks at the last path component, so dots in directory names and files
        # without extension are handled
        root, extension = os.path.splitext(filename)
        filename = f"{root}.auto-update{extension}"
    click.echo(f"Writing the update volumes in {filename}")
    with open(filename, "w") as f:
        dump_yaml(record, f, sort_keys=False)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
@cli.command("upgrade")
@click.argument("files", type=click.File(), nargs=-1)
def upgrade(files):
    """ Upgrade [FILES] to the latest supported schema """
    from htruc.schemas import recursive_update

    for handle in files:
        click.echo(click.style(f"Upgrading {handle.name}", fg="green"))
        record, applied_upgrades = recursive_update(parse_yaml(handle))
        if applied_upgrades:
            # The reading handle is released before the same path is opened for writing
            handle.close()
            with open(handle.name, "w") as target:
                dump_yaml(record, target, sort_keys=False)
        else:
            click.echo(click.style("--> No upgrade required", fg="yellow"))
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
if __name__ == "__main__":
|
|
210
|
+
cli()
|
htruc/repos/__init__.py
ADDED
htruc/repos/_generic.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from typing import Optional, Dict, Any
|
|
2
|
+
import requests
|
|
3
|
+
from htruc.utils import parse_yaml
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
Catalog = Dict[str, Any]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_a_yaml(address: str, raise_on_parse_error: bool = False, timeout: Optional[float] = 30) -> Optional[Catalog]:
    """ Download a document and parse it with `parse_yaml`.

    :param address: Address of the document
    :param raise_on_parse_error: Re-raise the parsing error instead of returning None
    :param timeout: Seconds to wait for the server before giving up; None waits indefinitely
    :returns: The parsed document; None when the server answers with a status of 400 or more, or
        when parsing fails and `raise_on_parse_error` is False
    """
    req = requests.get(address, timeout=timeout)
    if req.status_code >= 400:
        return None

    try:
        return parse_yaml(req.text)
    except Exception:
        print(f"Parse error on {address}")
        if raise_on_parse_error:
            raise
        return None
|
htruc/repos/_github.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from typing import Optional, Dict, Any, Iterable
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
import github
|
|
5
|
+
from ruamel.yaml import YAML, parser
|
|
6
|
+
from github import Github
|
|
7
|
+
from github.GithubException import UnknownObjectException
|
|
8
|
+
from htruc.utils import parse_yaml
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
Catalog = Dict[str, Any]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_github_repo_yaml(
        address: str,
        access_token: Optional[str] = None,
        raise_on_parse_error: bool = False) -> Optional[Catalog]:
    """ Read and parse the `htr-united.yml` file of a GitHub repository.

    :param address: Address containing `github.com/<user>/<repository>`; a trailing `.git` is dropped
    :param access_token: Github Access Token given to the GitHub client
    :param raise_on_parse_error: Re-raise the YAML parser error instead of returning None
    :returns: The parsed file; None when GitHub reports an error, or when the parser fails and
        `raise_on_parse_error` is False

    >>> get_github_repo_yaml("github.com/htr-united/cremma-medieval.git")["title"]
    'Cremma Medieval'
    """
    suffix = ".git"
    owner, name = re.findall("github.com/([^/]+)/([^/]+)", address)[0]
    if name.endswith(suffix):
        name = name[:-len(suffix)]

    client = Github(access_token)
    try:
        remote_file = client.get_repo(f"{owner}/{name}").get_contents("htr-united.yml")
        text = remote_file.decoded_content.decode()
        print("--- Found htr-united.yml")
    except (UnknownObjectException, github.GithubException):
        return None

    try:
        return parse_yaml(text)
    except parser.ParserError:
        print(f"Parse error on {owner}/{name}")
        if raise_on_parse_error:
            raise
        return None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def get_github_repo_cff(
        address: str,
        access_token: Optional[str] = None) -> Optional[str]:
    """ Read the citation file found at the root of a GitHub repository.

    The file is matched on its name, `citation.cff`, whatever the case.

    :param address: Address containing `github.com/<user>/<repository>`; a trailing `.git` is dropped
    :param access_token: Github Access Token given to the GitHub client
    :returns: The text of the file; None when the repository has no such file at its root or when
        GitHub reports an error
    """
    user, repo_name = re.findall("github.com/([^/]+)/([^/]+)", address)[0]
    if repo_name.endswith(".git"):
        repo_name = repo_name[:-4]

    g = Github(access_token)
    try:
        repo = g.get_repo(f"{user}/{repo_name}")
        for github_content in repo.get_contents(""):
            if github_content.name.lower() == "citation.cff":
                return repo.get_contents(github_content.name).decoded_content.decode()
    except (UnknownObjectException, github.GithubException) as error:
        # Same outcome as get_github_repo_yaml: a failing repository yields None instead of
        # interrupting the caller
        print(f"Unable to read {user}/{repo_name} on GitHub: {error}")
        return None
    return None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_htr_united_repos(
        access_token: Optional[str] = None,
        main_organization: str = "htr-united",
        exclude: Iterable[str] = ("htr-united", "template-htr-united-datarepo", )
) -> Dict[str, Catalog]:
    """ Collect the catalog records of the public repositories of a GitHub organization.

    :param access_token: Github Access Token given to the GitHub client
    :param main_organization: Name of the organization to scan
    :param exclude: Names of the repositories to skip; None stands for the default list
    :returns: Mapping of the `full_name` of each repository to its record, for the repositories
        where `get_github_repo_yaml` found one
    """
    if exclude is None:
        exclude = ("htr-united", "template-htr-united-datarepo", )
    elif isinstance(exclude, str):
        # A lone name, not a collection of characters
        exclude = (exclude, )
    # Materialised once: a one-shot iterable would be exhausted by the first membership test
    excluded = frozenset(exclude)

    g = Github(access_token)
    o = g.get_organization(main_organization)
    out = {}
    for repo in o.get_repos(type="public"):
        if repo.name in excluded:
            continue
        data = get_github_repo_yaml(repo.clone_url, access_token=access_token)
        if data:
            out[repo.full_name] = data
    return out
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from typing import Dict, Tuple, Callable, AnyStr, Iterable
|
|
2
|
+
from htruc.schemas.upgrade_path import upgrade_2021_10_15_to_2022_04_15, upgrade_2022_04_15_to_2023_06_27
|
|
3
|
+
from htruc.types import SchemaVersion, CatalogRecord
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# This Tuple is used to know which schema are supported
|
|
7
|
+
UpgradeOrder: Tuple[SchemaVersion, ...] = (
|
|
8
|
+
"https://htr-united.github.io/schema/2021-10-15/schema.json",
|
|
9
|
+
"https://htr-united.github.io/schema/2022-04-15/schema.json",
|
|
10
|
+
"https://htr-united.github.io/schema/2023-06-27/schema.json",
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
UpgradeFunction: Dict[str, Callable[[CatalogRecord], CatalogRecord]] = {
|
|
14
|
+
UpgradeOrder[0]: upgrade_2021_10_15_to_2022_04_15,
|
|
15
|
+
UpgradeOrder[1]: upgrade_2022_04_15_to_2023_06_27
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def recursive_update(catalog_record: CatalogRecord) -> Tuple[CatalogRecord, Iterable[SchemaVersion]]:
    """ Automatically upgrade a schema

    The `schema` property of the record gives its current version. Each upgrade function listed
    in `UpgradeFunction` is applied in turn, from that version up to the last one of `UpgradeOrder`.

    :param catalog_record: Record to upgrade
    :returns: The catalog record upgrade to the latest schema, with the list of upgrade it went through
        (empty when the record already uses the latest schema)
    :raises ValueError: When the version of the record is not listed in `UpgradeOrder`
    """
    version = catalog_record["schema"]
    # If the version is the latest, no need to upgrade
    if version == UpgradeOrder[-1]:
        return catalog_record, []

    try:
        current_version_index = UpgradeOrder.index(version)
    except ValueError:
        # Same exception type as before, with a message that names the faulty value
        raise ValueError(
            f"Unsupported schema version `{version}`, expected one of: {', '.join(UpgradeOrder)}"
        ) from None

    # The last version is left out: it is the target, nothing upgrades from it
    pending = UpgradeOrder[current_version_index:-1]
    for outdated_version in pending:
        catalog_record = UpgradeFunction[outdated_version](catalog_record)

    return catalog_record, pending
|