htruc 1.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
htruc/__init__.py ADDED
File without changes
htruc/catalog.py ADDED
@@ -0,0 +1,301 @@
1
+ from typing import Optional, Dict, Any, List, Tuple, Iterable, Union
2
+ import os
3
+ import logging
4
+ import pandas
5
+ import cffconvert
6
+ import requests
7
+ import re
8
+
9
+ from htruc.repos import get_github_repo_yaml, get_htr_united_repos, get_github_repo_cff
10
+ from htruc.utils import parse_yaml
11
+ from htruc import validator
12
+ from htruc.schemas import recursive_update
13
+ from htruc.types import CatalogRecord, Catalog
14
# Module-level logger used by the functions of this module.
# NOTE(review): getLogger() is called without a name, so this is the root
# logger; the setLevel() call below therefore applies to the root logger as
# soon as this module is imported.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
16
+
17
# Captures the numeric record identifier found in a Zenodo landing-page URL.
# Both the legacy ``/record/<id>`` and the ``/records/<id>`` path forms are
# accepted, hence the optional trailing "s".
_ZenodoRecord: re.Pattern = re.compile(r"zenodo\.org/records?/([0-9]+)")
18
+
19
+
20
def _clean_a_dict(catalog: Catalog) -> Catalog:
    """ Remove from the catalog every record that does not pass validation.

    The catalog is modified in place and returned.

    :param catalog: Mapping of record key to catalog record
    :returns: The same mapping, without the records reported as invalid
    """
    invalid: List[str] = []

    for schema in catalog:
        for status in validator.run([catalog[schema]], schema_path="auto"):
            if not status.status:  # If the schema is invalid
                invalid.append(schema)
                logging.warning(f"Invalid schema file for {schema}")
                # One failing status is enough: registering the same key twice
                # would make the deletion loop below raise KeyError.
                break

    for key in invalid:
        del catalog[key]

    return catalog
33
+
34
+
35
def _upgrade_a_dict(catalog: Catalog) -> Catalog:
    """ Run ``recursive_update`` on each record and store the upgraded versions.

    A record is only replaced when ``recursive_update`` reports at least one
    applied upgrade. The catalog is modified in place and returned.

    :param catalog: Mapping of record key to catalog record
    :returns: The same mapping
    """
    for record_key, record in catalog.items():
        upgraded, applied_steps = recursive_update(record)
        if not applied_steps:
            continue
        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over it.
        catalog[record_key] = upgraded

    return catalog
42
+
43
+
44
def get_local_yaml(directory: str, keep_valid_only: bool = True) -> Catalog:
    """ Reads all local YAML file in a given directory and parses them as Catalog Record.

    Files are looked up recursively. Each record is indexed by its ``url``
    value; files that cannot be parsed, or whose record has no ``url``, are
    skipped with a warning.

    :param directory: Directory to scan
    :param keep_valid_only: Only keeps passing HTR-United files
    :returns: Mapping of record URL to parsed record
    """
    out = {}
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith((".yml", ".yaml")):
                continue
            try:
                data = parse_yaml(os.path.join(root, file))
                url = data.get("url")
            except Exception as E:
                logging.warning(f"Impossible to parse and understand {file}")
                logger.info(str(E))
                continue
            if not url:
                # Indexing under None would make url-less records overwrite
                # each other and break string tests and sorting on the keys.
                logging.warning(f"Ignoring {file}: the record has no `url` to be indexed by")
                continue
            out[url] = data
    if keep_valid_only:
        _clean_a_dict(out)
    return out
64
+
65
+
66
def clever_catalog_update(catalog1: Dict, catalog2: Dict) -> Dict:
    """ Merge ``catalog2`` into ``catalog1`` and return ``catalog1``.

    Records unknown to ``catalog1`` are added as they are. For records known
    to both, only the ``characters`` and ``volume`` entries present in the
    ``catalog2`` record are copied over.

    :param catalog1: Catalog to update (modified in place)
    :param catalog2: Catalog providing the new values
    :returns: ``catalog1``
    """
    refreshable = ("characters", "volume")
    for repository, incoming in catalog2.items():
        if repository not in catalog1:
            catalog1[repository] = incoming
            continue
        known = catalog1[repository]
        for field in refreshable:
            if field in incoming:
                known[field] = incoming[field]
    return catalog1
75
+
76
+
77
def get_all_catalogs(
        access_token: Optional[str] = None,
        local_directory: Optional[str] = None,
        get_distant: bool = True,
        organizations: Optional[Union[str, Iterable[str]]] = "htr-united",
        check_link: bool = False,
        ignore_orgs_gits: List[str] = None,
        keep_valid_only: bool = True,
        auto_upgrade: bool = False,
        citation_cff: bool = False
) -> Catalog:
    """ Retrieve repositories from various location (online, locally) and create a catalog out of the records.

    :param access_token: Github Access Token to retrieve information ~ without limit from Github.com
    :param local_directory: Local directory to scan for files
    :param get_distant: Retrieves data from organisations on Github (Scan all their repositories)
    :param organizations: Organizations to scan (a single name or several); None scans nothing
    :param check_link: If a local directory catalog record links to a github repository, scan the remote repository
        for any updates on the catalog
    :param ignore_orgs_gits: Ignore specific repositories in the scan; when None, the default exclusion list of
        `get_htr_united_repos` applies
    :param keep_valid_only: Only Keeps valid catalog record
    :param auto_upgrade: Upgrade automatically all schemas to the latest version (Only applied if keep_valid_only is
        True)
    :param citation_cff: Look up citation data for each record and merge the `_bibtex` / `_apa` values found
        into it
    :returns: The catalog, sorted by key
    """
    data: Catalog = {}
    if local_directory:
        data.update(get_local_yaml(directory=local_directory, keep_valid_only=False))
        if check_link:
            for uri in data:
                # We update the catalog if needs be by checking each repo
                if isinstance(uri, str) and "github.com" in uri:
                    print(f"Fetching {uri} remotely to update metrics")
                    results = get_github_repo_yaml(address=uri, access_token=access_token)
                    if results:
                        data[uri] = results
    if get_distant:
        if organizations is None:
            organizations = ()
        elif isinstance(organizations, str):
            organizations = (organizations, )
        # Forward `exclude` only when the caller gave one: passing None
        # explicitly would override the callee's default and break its
        # membership test.
        scan_options = {} if ignore_orgs_gits is None else {"exclude": ignore_orgs_gits}
        for orga in organizations:
            data = clever_catalog_update(
                data,
                get_htr_united_repos(
                    access_token=access_token,
                    main_organization=orga,
                    **scan_options
                )
            )
    if keep_valid_only:
        _clean_a_dict(data)
    if auto_upgrade and keep_valid_only:
        _upgrade_a_dict(data)
    if citation_cff:
        for key in data:
            up = _get_bibtex_and_apa(data[key], access_token=access_token)
            if up:
                logger.info(f"Successfully retrieved Bibtex or/and APA for {key}")
                data[key].update(up)
    return dict(sorted(data.items()))
135
+
136
+
137
def get_statistics(repositories: Catalog) -> pandas.DataFrame:
    """ Retrieve statistics from a dictionary of repositories

    One row is produced per entry of a record's ``volume`` list, with the
    columns ``uri``, ``title``, ``start``, ``end``, ``metric`` (lowercased),
    ``count``, ``format`` and ``script-type``. A record whose values are
    missing or cannot be converted is reported with a warning; rows already
    built for it before the failure are kept.

    :param repositories: Dictionary of Repositories records
    :returns: DataFrame of the collected rows
    """
    rows = []
    for repository, entry in repositories.items():
        try:
            begin, end = entry["time"]["notBefore"], entry["time"]["notAfter"]
            for a_volume in entry.get("volume", []):
                rows.append({
                    "uri": repository,
                    "title": entry["title"],
                    "start": int(begin),
                    "end": int(end),
                    "metric": a_volume["metric"].lower(),
                    "count": int(a_volume["count"]),
                    "format": entry["format"],
                    "script-type": entry["script-type"]
                })
        except (KeyError, TypeError, ValueError, AttributeError):
            # ValueError: int() on a non numeric string; AttributeError: a
            # metric that is not a string. Neither should abort the whole run.
            logger.warning(f"Unable to parse {repository} for statistics")
    return pandas.DataFrame(rows)
167
+
168
+
169
def group_per_year(df: pandas.DataFrame, column: Optional[str] = "metric", period: int = 50):
    """ Group a column per year

    The time span covered by ``df`` is cut into buckets of ``period`` years,
    aligned on multiples of ``period``. For each bucket, the ``count`` of
    every row whose [start, end] range overlaps the bucket is summed per
    value of ``column``.

    :param df: DataFrame with ``start``, ``end``, ``count`` and ``column`` columns
    :param column: Column whose values become the output columns
    :param period: Size of a bucket, in years
    :returns: DataFrame with a ``year`` column (bucket start) and one integer column per value of ``column``

    >>> group_per_year(pandas.DataFrame([
    ...     {"start": 1300, "end": 1399, "metric": "line", "count": 134},
    ...     {"start": 1300, "end": 1399, "metric": "characters", "count": 234},
    ...     {"start": 1350, "end": 1449, "metric": "line", "count": 34},
    ...     {"start": 1500, "end": 1551, "metric": "line", "count": 37},
    ...     {"start": 1503, "end": 1504, "metric": "line", "count": 37},
    ...     {"start": 1300, "end": 1499, "metric": "line", "count": 2}]))
       year  characters  line
    0  1300         234   136
    1  1350         234   170
    2  1400           0    36
    3  1450           0     2
    4  1500           0    74
    5  1550           0    37

    """
    if df.empty:
        # Nothing to group: there is no start/end value to build buckets from.
        return pandas.DataFrame(columns=["year"]).astype(int)

    first_bucket = int(df.start.min()) // period * period
    # Start of the bucket that contains the latest year. It must be part of
    # the range even when that year is itself a multiple of `period`.
    last_bucket = int(df.end.max()) // period * period

    new_df = []
    for range_start in range(first_bucket, last_bucket + 1, period):
        range_end = range_start + period - 1
        overlapping = df[
            df.start.between(range_start, range_end)
            | df.end.between(range_start, range_end)
            | ((df.start < range_start) & (range_end < df.end))
        ]
        new_df.append({
            "year": range_start,
            **overlapping.groupby(column)["count"].sum()
        })
    return pandas.DataFrame(new_df).fillna(0).astype(int)
204
+
205
+
206
# A volume description: a list of {"metric": <name>, "count": <value>} dicts
MetricLists = List[Dict[str, int]]


def update_volume(original_volume: MetricLists, metrics: MetricLists) -> Tuple[MetricLists, MetricLists]:
    """ Merge freshly computed metrics into an existing volume list.

    The first returned list holds every metric of both inputs, sorted by
    metric name, the value of ``metrics`` winning when a metric is in both.
    The second list holds, for the metrics present in both inputs only, the
    new count minus the old one.

    :param original_volume: Volume currently stored in the catalog record
    :param metrics: Newly computed volume
    :returns: (updated volume, differences)

    >>> old = [{"metric": "pages", "count": 5}, {"metric": "documents", "count": 5}]
    >>> new = [{"metric": "pages", "count": 10}, {"metric": "line", "count": 105}]
    >>> out = update_volume(old, new)
    >>> out == (
    ...     [{"metric": "documents", "count": 5}, {"metric": "line", "count": 105}, {"metric": "pages", "count": 10}],
    ...     [{"metric": "pages", "count": 5}],
    ... )
    True

    """
    previous = {}
    for vol in original_volume:
        previous[vol["metric"]] = vol["count"]
    current = {}
    for vol in metrics:
        current[vol["metric"]] = vol["count"]

    names = sorted(set(previous) | set(current))

    changes = []
    for name in names:
        if name in previous and name in current:
            changes.append({"metric": name, "count": current[name] - previous[name]})

    merged = []
    for name in names:
        value = current[name] if name in current else previous[name]
        merged.append({"metric": name, "count": value})

    return merged, changes
230
+
231
+
232
def _get_bibtex_and_apa(catalog_record: CatalogRecord, access_token: Optional[str]=None) -> Dict[str, str]:
    """ Retrieve citation data for a catalog record.

    Sources are tried in this order, the first non-empty result is returned:

    1. `_get_github_citation_file` (may provide `_bibtex` and `_apa`);
    2. the Zenodo API, when the record `url` contains a Zenodo record id (`_bibtex` only);
    3. the record `url` itself when it contains `doi.org`, asking for Bibtex (`_bibtex` only).

    HTTP failures are logged and treated as "nothing found".

    :param catalog_record: Catalog record, must have a `url`
    :param access_token: Github access token, forwarded to the Github lookup
    :returns: Dictionary with `_bibtex` and/or `_apa`, empty when nothing could be retrieved
    """
    through_github = _get_github_citation_file(catalog_record, access_token)
    if through_github:
        return through_github

    url = catalog_record["url"]
    bibtex_headers = {"Accept": "application/x-bibtex"}
    # Without a timeout, a stalled server would block the catalog build forever
    timeout = 30

    zenodo_match = _ZenodoRecord.search(url)
    if zenodo_match:
        record = zenodo_match.group(1)
        try:
            req = requests.get(f"https://zenodo.org/api/records/{record}", headers=bibtex_headers, timeout=timeout)
            req.raise_for_status()
            return {"_bibtex": req.text}
        except Exception as E:
            logger.error(f"Unable to reach Zenodo for Bibtex ({url}): {E}")

    if "doi.org" in url:
        try:
            req = requests.get(url, headers=bibtex_headers, timeout=timeout)
            req.raise_for_status()
            return {"_bibtex": req.text}
        except Exception as E:
            logger.error(f"Unable to reach the DOI API for Bibtex ({url}): {E}")

    return {}
258
+
259
+
260
def _get_github_citation_file(catalog_record: CatalogRecord, access_token: Optional[str] = None) -> Dict[str, str]:
    """ Retrieve a CITATION file for a record and convert it to Bibtex and APA.

    The file content comes from the record's `citation-file-link` when there is one, otherwise from the Github
    repository given by the record `url`. When the link cannot be used and the `url` is on Github, the
    repository is tried as a fallback.

    :param catalog_record: Catalog record, must have a `url`
    :param access_token: Github access token
    :returns: Dictionary with `_bibtex` and/or `_apa`; empty when no file was found or when it could not be parsed
    """
    if "citation-file-link" not in catalog_record and "github.com" not in catalog_record["url"]:
        return {}
    elif "citation-file-link" not in catalog_record:
        try:
            citation_file_content = get_github_repo_cff(catalog_record["url"], access_token=access_token)
        except Exception as E:
            # A single unreachable repository must not abort the whole run
            logger.error(f"Unable to retrieve the CITATION file of {catalog_record['url']} from Github: {E}")
            return {}
        if not citation_file_content:
            return {}
    else:  # We got a URI
        try:
            # Without a timeout, a stalled server would block forever
            req = requests.get(catalog_record["citation-file-link"], timeout=30)
            req.raise_for_status()
            citation_file_content = req.text
            if "</html>" in citation_file_content.lower():
                raise Exception("CFF File link is wrong, it returns HTML.")
            elif citation_file_content[0] == "{":
                raise Exception("Got JSON at the given endpoint instead of YAML")
        except Exception as E:
            logger.error(f"Error retrieving CITATION File for {catalog_record['citation-file-link']}: {str(E)}")
            if "github.com" in catalog_record["url"]:
                logger.error("Trying to reach github directly")
                # Retry with the url only, which takes the Github branch above
                return _get_github_citation_file({"url": catalog_record["url"]}, access_token=access_token)
            return {}

    try:
        citation = cffconvert.Citation(citation_file_content)
    except Exception as E:
        logger.error(f"Unable to parse CFF for {catalog_record['url']} ({E})")
        nl = "\n"
        logger.error(f"Content: \n>>> {citation_file_content.replace(nl, nl+'>>> ')}")
        return {}

    return_obj = {}
    # Each export is attempted independently, one may work without the other
    try:
        return_obj["_bibtex"] = citation.as_bibtex()
    except Exception as E:
        logger.error(f"Unable to parse as Bibtex {catalog_record['url']} ({E})")

    try:
        return_obj["_apa"] = citation.as_apalike()
    except Exception as E:
        logger.error(f"Unable to parse as APA {catalog_record['url']} ({E})")

    return return_obj
htruc/cli.py ADDED
@@ -0,0 +1,210 @@
1
+ import sys
2
+ import click
3
+ import os.path
4
+ from ruamel.yaml import YAML
5
+ import json
6
+ from typing import Optional, List
7
+
8
+ from htruc.validator import run
9
+ from htruc.catalog import get_all_catalogs, get_statistics, group_per_year, update_volume, _get_bibtex_and_apa
10
+ from htruc.utils import parse_yaml, create_json_catalog, get_local_or_download, dump_yaml
11
+
12
+
13
def _error(message):
    """ Print a message in red on the command line output """
    styled = click.style(message, fg="red")
    click.echo(styled, color=True)
18
+
19
+
20
# Root click group; the sub-commands below are attached to it through
# the @cli.command decorator. Its docstring doubles as the CLI help text.
@click.group()
def cli():
    """ Interface for HTRUC """
23
+
24
+
25
@cli.command("test")
@click.argument("files", type=click.File(), nargs=-1)
@click.option(
    "--version", type=str, default="auto", show_default=True,
    help="Date of the schema version"
)
@click.option("--force-download", is_flag=True, help="Download the schema using the version provided")
def test(files, version: str, force_download: bool):
    """ Test catalog files """
    # Validates every file, prints the failures and a pass rate, then exits
    # with -1 when at least one file failed and 0 otherwise.
    click.echo(f"{len(files)} to be tested")
    statuses = []
    if version != "auto":
        version = get_local_or_download(version, force_download=force_download)
    for status in run(files, schema_path=version):
        statuses.append(status.status)
        if status.status is False:
            _error(f"☒ File `{status.filename}` testing failed")
            for message in status.messages:
                _error(f" {message}")
            click.echo()

    if not statuses:
        # FILES accepts zero values (nargs=-1): without this guard the pass
        # rate below would divide by zero.
        click.echo("No file was tested")
        sys.exit(0)

    passed = statuses.count(True)
    click.echo(
        click.style(
            f"{passed/len(statuses)*100:.2f}% of schema passed ({passed}/{len(statuses)})",
            fg="red" if False in statuses else "green"
        )
    )
    sys.exit(-1 if False in statuses else 0)
52
+
53
+
54
@cli.command("make")
@click.argument("directory", default="./catalog/")
@click.option("-o", "--organization", default=("htr-united", ), show_default=True, multiple=True,
              help="Organization to retrieve repositories from")
@click.option("--remote/--no-remote", is_flag=True, default=True, show_default=True,
              help="Retrieve data from remote repositories in the organization's account")
@click.option("--clean/--dirty", is_flag=True, default=True, show_default=True,
              help="Keep only the valid catalog records")
@click.option("--auto-upgrade/--no-auto-upgrade", is_flag=True, default=True, show_default=True,
              help="Automatically upgrade to the latest schema")
@click.option("--citation/--no-citation", is_flag=True, default=True, show_default=True,
              help="Retrieve CITATION.CFF from repositories and creates unstandardized _apa and _bibtex properties "
                   "for each record")
@click.option("--check-link", is_flag=True, default=False, show_default=True,
              help="For each github repository documented in the local files, tries to download a `htr-united.yaml`"
                   " file from it.")
@click.option("--output", default="catalog.yaml", show_default=True,
              help="Dumps the agglutinated catalog as YAML")
@click.option("--json", default=None, show_default=True,
              help="Dumps the whole catalog as JSON too")
@click.option("--graph", default=None, show_default=True,
              help="Produce a graph at the path given (PNG Files please) with the amount of metrics "
                   "at different times")
@click.option("--graph-csv", default=None, show_default=True,
              help="Outputs the data behind the graph into a CSV file")
@click.option("--access_token", default=None, show_default=True,
              help="Github Access token")
@click.option("--statistics", default=None, show_default=True,
              help="Produce a recap CSV file with different statistics about the period covered by the dataset")
@click.option("--ignore-repo", default=["htr-united", "template-htr-united-datarepo", "template-depot"], multiple=True, show_default=True,
              help="Repos of the main organization that can be ignored")
@click.option("--ids", default="ids.json", type=click.Path(dir_okay=False), show_default=True,
              help="JSON file with IDs that maps each repository URLs")
def make(directory, organization: str, access_token: Optional[str] = None, remote: bool = True,
         check_link: bool = False, output: str = "catalog.yaml",
         json: Optional[str] = None,
         graph: Optional[str] = None,
         statistics: Optional[str] = None,
         graph_csv: Optional[str] = None,
         ignore_repo: List[str] = None,
         ids: click.File = None,
         auto_upgrade: bool = True,
         clean: bool = True,
         citation: bool = True):
    """ Generate a catalog from a main repository and an organization

    """
    # NOTE: `organization` and `ignore_repo` are declared with multiple=True,
    # so click provides them as sequences of values.
    catalog = get_all_catalogs(
        access_token=access_token,
        organizations=organization,
        local_directory=directory,
        get_distant=remote,
        check_link=check_link,
        ignore_orgs_gits=ignore_repo,
        keep_valid_only=clean,
        auto_upgrade=auto_upgrade,
        citation_cff=citation
    )
    click.echo(f"Dumping YAML output into {output}")
    with open(output, "w") as f:
        dump_yaml(list(catalog.values()), f, sort_keys=False)

    if json:
        click.echo(f"Dumping JSON output into {json}")
        # The `json` parameter shadows the module of the same name here
        from json import dump
        with open(json, "w") as f:
            dump(create_json_catalog(catalog, ids_files=ids), f)

    if graph or statistics or graph_csv:
        stats = get_statistics(catalog)
        if statistics:
            click.echo(f"Writing stats to {statistics}")
            stats.to_csv(statistics)
        if (graph or graph_csv) and stats.empty:
            # There is no period to build a graph from
            _error("No statistics could be computed, skipping the graph outputs")
        elif graph or graph_csv:
            data = group_per_year(stats)
            if graph_csv:
                click.echo(f"Plotting stats to {graph_csv}")
                data.to_csv(graph_csv)
            if graph:
                click.echo(f"Plotting {len(data.columns)-1} files with {graph} basename")
                import matplotlib.pyplot as plot

                cols = [col for col in data.columns if col != "year"]
                num_axes = len(cols)
                # At least one row, subplots() rejects an empty grid
                nrows = max(1, num_axes // 2 + int(bool(num_axes % 2)))
                fig, axes = plot.subplots(
                    nrows=nrows,
                    ncols=2,
                    sharex=True,
                    # squeeze=False always returns a 2D array of axes; with
                    # squeeze=True a single row comes back as a 1D array and
                    # iterating over its "rows" fails.
                    squeeze=False,
                    figsize=(10, 5 * nrows),
                    dpi=300
                )
                for metric, ax in zip(cols, axes.flat):
                    data.plot.line(x="year", y=metric, ax=ax)
                fig.savefig(graph)
                click.echo(f"Saved {graph}")
152
+
153
+
154
@cli.command("update-volumes")
@click.argument("catalog-file", type=click.File(), nargs=1)
@click.argument("metrics-json", type=click.File(), nargs=1)
@click.option(
    "--inplace", type=bool, is_flag=True, default=False, show_default=True,
    help="Saves the modified catalog inside the original file"
)
def catalog_volume_update(catalog_file, metrics_json, inplace):
    """ Update the metrics of a file """
    # Merges the `volume` (and, except for the 2021-10-15 schema, the
    # `characters`) of METRICS_JSON into the record of CATALOG_FILE, reports
    # the differences, then writes the record either in place or next to the
    # original file with an `auto-update` marker before the extension.
    record = parse_yaml(catalog_file)
    parsed_metrics = json.load(metrics_json)
    metrics_volume = parsed_metrics["volume"]
    updated, difference = update_volume(record.get("volume", []), metrics_volume)
    record["volume"] = updated
    for metric in difference:
        if metric["count"] < 0:
            click.echo(click.style(f"> The category `{metric['metric']}` decreased by {abs(metric['count'])}",
                                   fg="yellow"))
        else:
            click.echo(click.style(f"> The category `{metric['metric']}` increased by {metric['count']}", fg="green"))

    # Close the original file
    catalog_file.close()

    # Schema values are schema URLs that contain the version date, so the
    # version is looked up inside the value; a strict comparison with the
    # bare date would never match such a URL.
    if "2021-10-15" not in record["schema"] and "characters" in parsed_metrics:
        if "characters" not in record:
            record["characters"] = {}
        record["characters"].update(parsed_metrics["characters"])

    filename = f"{catalog_file.name}"
    if not inplace:
        # splitext only looks at the last path component, so dots in the
        # directory names are left alone
        root, extension = os.path.splitext(filename)
        filename = f"{root}.auto-update{extension}"
    click.echo(f"Writing the update volumes in {filename}")
    with open(filename, "w") as f:
        dump_yaml(record, f, sort_keys=False)
190
+
191
+
192
@cli.command("upgrade")
@click.argument("files", type=click.File(), nargs=-1)
def upgrade(files):
    """ Upgrade [FILES] to the latest supported schema """
    for handle in files:
        target_path = handle.name
        click.echo(click.style(f"Upgrading {target_path}", fg="green"))
        record = parse_yaml(handle)
        from htruc.schemas import recursive_update
        record, applied_steps = recursive_update(record)
        if applied_steps:
            # The read handle is released before the same path is rewritten
            handle.close()
            with open(target_path, "w") as sink:
                dump_yaml(record, sink, sort_keys=False)
        else:
            click.echo(click.style("--> No upgrade required", fg="yellow"))
207
+
208
+
209
# Run the command group when this module is executed as a script
if __name__ == "__main__":
    cli()
@@ -0,0 +1,2 @@
1
+ from ._generic import get_a_yaml
2
+ from ._github import get_htr_united_repos, get_github_repo_yaml, get_github_repo_cff
@@ -0,0 +1,21 @@
1
+ from typing import Optional, Dict, Any
2
+ import requests
3
+ from htruc.utils import parse_yaml
4
+
5
+
6
Catalog = Dict[str, Any]


def get_a_yaml(address: str, raise_on_parse_error: bool = False) -> Optional[Catalog]:
    """ Download a YAML document and parse it.

    :param address: URL of the YAML document
    :param raise_on_parse_error: Re-raise the parsing error instead of returning None
    :returns: The parsed document; None when the server answers with a status code of 400 or more, or when
        parsing fails and `raise_on_parse_error` is False
    """
    # Without a timeout, a stalled server would block the caller forever
    req = requests.get(address, timeout=30)
    if req.status_code >= 400:
        return None

    try:
        return parse_yaml(req.text)
    except Exception:
        print(f"Parse error on {address}")
        if raise_on_parse_error:
            raise
        return None
htruc/repos/_github.py ADDED
@@ -0,0 +1,86 @@
1
+ from typing import Optional, Dict, Any, Iterable
2
+ import re
3
+
4
+ import github
5
+ from ruamel.yaml import YAML, parser
6
+ from github import Github
7
+ from github.GithubException import UnknownObjectException
8
+ from htruc.utils import parse_yaml
9
+
10
+
11
Catalog = Dict[str, Any]


def get_github_repo_yaml(
        address: str,
        access_token: Optional[str] = None,
        raise_on_parse_error: bool = False) -> Optional[Catalog]:
    """ Read and parse the `htr-united.yml` file found at the root of a Github repository.

    :param address: Address of the repository, containing `github.com/<user>/<repository>`
    :param access_token: Github access token
    :param raise_on_parse_error: Re-raise the YAML parser error instead of returning None
    :returns: The parsed file; None when the address holds no `<user>/<repository>`, when Github reports an
        error (missing repository or file included), or when parsing fails and `raise_on_parse_error` is False

    >>> get_github_repo_yaml("github.com/htr-united/cremma-medieval.git")["title"]
    'Cremma Medieval'
    """
    found = re.search(r"github\.com/([^/]+)/([^/]+)", address)
    if not found:
        # e.g. an address that stops at the user or organization name
        return None
    user, repo_name = found.groups()
    if repo_name.endswith(".git"):
        repo_name = repo_name[:-4]
    g = Github(access_token)
    try:
        repo = g.get_repo(f"{user}/{repo_name}")
        text = repo.get_contents("htr-united.yml").decoded_content.decode()
        print("--- Found htr-united.yml")
    except github.GithubException:
        # Also covers UnknownObjectException, which is one of its subclasses
        return None

    try:
        return parse_yaml(text)
    except parser.ParserError:
        print(f"Parse error on {user}/{repo_name}")
        if raise_on_parse_error:
            raise
        return None
43
+
44
+
45
def get_github_repo_cff(
        address: str,
        access_token: Optional[str] = None) -> Optional[str]:
    """ Read the citation file found at the root of a Github repository.

    The file is looked up by name, ignoring case (`citation.cff`).

    :param address: Address of the repository, containing `github.com/<user>/<repository>`
    :param access_token: Github access token
    :returns: The text of the file; None when the address holds no `<user>/<repository>`, when there is no
        such file, or when Github reports an error
    """
    found = re.search(r"github\.com/([^/]+)/([^/]+)", address)
    if not found:
        return None
    user, repo_name = found.groups()
    if repo_name.endswith(".git"):
        repo_name = repo_name[:-4]
    g = Github(access_token)
    try:
        # Both the repository lookup and the root listing can fail, so they
        # belong in the try block as much as the file download does.
        repo = g.get_repo(f"{user}/{repo_name}")
        for github_content in repo.get_contents(""):
            if github_content.name.lower() == "citation.cff":
                return repo.get_contents(github_content.name).decoded_content.decode()
    except github.GithubException:
        # Also covers UnknownObjectException, which is one of its subclasses
        return None
    return None
66
+
67
+
68
def get_htr_united_repos(
    access_token: Optional[str] = None,
    main_organization: str = "htr-united",
    exclude: Iterable[str] = ("htr-united", "template-htr-united-datarepo", )
) -> Dict[str, Catalog]:
    """ Collect the catalog record of each public repository of a Github organization.

    Repositories for which `get_github_repo_yaml` returns nothing are left out.

    :param access_token: Github access token
    :param main_organization: Name of the organization to scan
    :param exclude: Names of the repositories to skip; None falls back on the default value
    :returns: Mapping of repository full name to its catalog record
    """
    if exclude is None:
        exclude = ("htr-united", "template-htr-united-datarepo", )
    # A set keeps the membership test valid for one-shot iterables too
    excluded = set(exclude)

    g = Github(access_token)
    o = g.get_organization(main_organization)
    out = {}
    for repo in o.get_repos(type="public"):
        if repo.name in excluded:
            continue
        data = get_github_repo_yaml(repo.clone_url, access_token=access_token)
        if data:
            out[repo.full_name] = data
    return out
@@ -0,0 +1,34 @@
1
+ from typing import Dict, Tuple, Callable, AnyStr, Iterable
2
+ from htruc.schemas.upgrade_path import upgrade_2021_10_15_to_2022_04_15, upgrade_2022_04_15_to_2023_06_27
3
+ from htruc.types import SchemaVersion, CatalogRecord
4
+
5
+
6
# This Tuple is used to know which schema are supported.
# recursive_update() relies on its order: upgrades are applied from a record's
# own version onwards, and the last item is treated as the latest version.
UpgradeOrder: Tuple[SchemaVersion, ...] = (
    "https://htr-united.github.io/schema/2021-10-15/schema.json",
    "https://htr-united.github.io/schema/2022-04-15/schema.json",
    "https://htr-united.github.io/schema/2023-06-27/schema.json",
)

# For each schema version but the last one, the function that is applied to a
# record declaring that version.
UpgradeFunction: Dict[str, Callable[[CatalogRecord], CatalogRecord]] = {
    UpgradeOrder[0]: upgrade_2021_10_15_to_2022_04_15,
    UpgradeOrder[1]: upgrade_2022_04_15_to_2023_06_27
}
17
+
18
+
19
def recursive_update(catalog_record: CatalogRecord) -> Tuple[CatalogRecord, Iterable[SchemaVersion]]:
    """ Bring a catalog record up to the last version listed in UpgradeOrder

    The version is read from the ``schema`` value of the record. Each upgrade
    function registered from that version onwards is applied in turn.

    :param catalog_record: Record to upgrade
    :returns: The resulting record, with the versions whose upgrade function was applied (empty when the record
        already declares the last version)

    """
    declared = catalog_record["schema"]
    latest = UpgradeOrder[-1]
    # Nothing to do for a record that is already at the last version
    if declared == latest:
        return catalog_record, []

    start = UpgradeOrder.index(declared)
    # The last version has no upgrade function, hence the -1 bound
    pending = UpgradeOrder[start:-1]

    upgraded = catalog_record
    for step in pending:
        upgraded = UpgradeFunction[step](upgraded)

    return upgraded, pending