fetchm2 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,586 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import shutil
5
+ import subprocess
6
+ from dataclasses import dataclass
7
+ from functools import lru_cache
8
+ from typing import Any
9
+
10
+ from .utils import first_present, read_package_csv, read_package_json
11
+
12
+
13
# Ordered list of the standardized column names. standardize_row() seeds every
# one of these with "" before filling in the values it can derive.
STANDARDIZED_COLUMNS = [
    # Host identity
    "Host_Original", "Host_Cleaned", "Host_SD", "Host_TaxID",
    # Host lineage
    "Host_Rank", "Host_Superkingdom", "Host_Phylum", "Host_Class", "Host_Order",
    "Host_Family", "Host_Genus", "Host_Species", "Host_Common_Name",
    # Host match provenance
    "Host_Match_Method", "Host_Confidence", "Host_Review_Status",
    # Controlled-vocabulary fields
    "Sample_Type_SD", "Sample_Type_SD_Broad",
    "Isolation_Source_SD", "Isolation_Source_SD_Broad",
    "Isolation_Site_SD",
    "Environment_Medium_SD", "Environment_Medium_SD_Broad",
    "Environment_Broad_Scale_SD", "Environment_Local_Scale_SD",
    "Host_Disease_SD", "Host_Health_State_SD",
    # Geography and time
    "Country", "Continent", "Subcontinent", "Collection_Year",
    # Free-text notes produced while applying controlled rules
    "FetchM2_Standardization_Notes",
]
47
+
48
# Input column names that may carry the host value, in lookup priority order.
HOST_ALIASES = ["Host", "host", "host scientific name", "host_scientific_name", "specific host"]
55
# Maps each logical source field label to the input column names that may
# carry it. Both the label order and the alias order are significant: callers
# iterate the dict and take the first alias with a non-empty value.
SOURCE_FIELDS = {
    "Sample Type": ["Sample Type", "sample_type", "sample type", "specimen", "sample material"],
    "Isolation Source": ["Isolation Source", "isolation_source", "isolation source", "source", "source type"],
    "Isolation Site": ["Isolation Site", "isolation_site", "isolation site", "anatomical site", "body site"],
    "Environment Medium": ["Environment Medium", "env_medium", "environmental medium", "environment"],
    "Environment Broad Scale": [
        "Environment Broad Scale",
        "env_broad_scale",
        "broad-scale environmental context",
    ],
    "Environment Local Scale": [
        "Environment Local Scale",
        "env_local_scale",
        "local-scale environmental context",
    ],
    "Host Disease": ["Host Disease", "host disease", "disease"],
    "Host Health State": ["Host Health State", "host health state", "health state"],
}
104
+
105
# Normalized (lower-cased, punctuation-trimmed) values that is_missing()
# treats as "no value supplied".
MISSING_TOKENS = {
    "", "na", "n/a", "none", "null", "nil",
    "unknown", "unk", "missing", "absent",
    "not collected", "not applicable", "not available", "not reported", "not provided",
    "no data", "no host",
    "#ref!",
}
125
+
126
# Terms whose presence in a geography value makes standardize_geography()
# return no country at all.
COUNTRY_FALSE_CONTEXT = re.compile(
    r"\b("
    + "|".join(
        (
            "hospital", "clinic", "outpatient", "inpatient", "ward",
            "guinea pig", "norway rat", "ground turkey", "aspergillus niger",
        )
    )
    + r")\b",
    re.I,
)

# First four-digit run starting with 19 or 20; no digit-boundary check is applied.
DATE_YEAR_RE = re.compile(r"(19|20)\d{2}")

# Food / retail product words; host_match() uses them to reject non-host values.
FOOD_PRODUCT_RE = re.compile(
    r"\b("
    + "|".join(
        (
            "sandwich", "salad", "sausage", "pasta", "food", "retail",
            "abattoir", "fillet", "tenderloin", "meat product",
        )
    )
    + r")\b",
    re.I,
)

# Words naming a sample material (as opposed to an organism).
SAMPLE_MATERIAL_RE = re.compile(
    r"\b("
    + "|".join(
        (
            "blood", "feces", "faeces", "stool", "urine", "sputum", "swab", "tissue",
            "milk", "saliva", "lavage", "pleural fluid", "meat", "manure",
        )
    )
    + r")\b",
    re.I,
)
139
+
140
+
141
def normalize_lookup(value: Any) -> str:
    """Return a canonical lookup key for *value*.

    ``None`` becomes ``""``. Anything else is stringified, trimmed and
    lower-cased; underscores and hyphens become spaces; brackets and quote
    characters become spaces; whitespace runs collapse to one space; and
    leading/trailing spaces and ``.,:;`` are stripped.
    """
    if value is None:
        return ""
    lowered = str(value).strip().lower()
    spaced = lowered.replace("_", " ").replace("-", " ")
    unquoted = re.sub(r"[()\[\]{}\"'`]+", " ", spaced)
    collapsed = re.sub(r"\s+", " ", unquoted)
    return collapsed.strip(" .,:;")
147
+
148
+
149
def is_missing(value: Any) -> bool:
    """Return True when the normalized form of *value* is one of MISSING_TOKENS."""
    normalized = normalize_lookup(value)
    return normalized in MISSING_TOKENS
151
+
152
+
153
@dataclass(frozen=True)
class ControlledRule:
    """Immutable record for one controlled-vocabulary rule row."""

    # Lookup key the rule is registered under.
    synonym: str
    # Source field the rule is restricted to; an empty string means unrestricted.
    source_column: str
    # Output column the rule writes to.
    destination: str
    # Value written to the destination column.
    proposed_value: str
    # Optional value for the "<destination>_Broad" column.
    broad_value: str
    ontology_id: str
    confidence: str
    method: str
163
+
164
+
165
@dataclass
class RuleStore:
    """Container for every lookup table built by load_rules()."""

    # normalized synonym -> (canonical name, taxid, confidence)
    host_exact: dict[str, tuple[str, str, str]]
    # same shape as host_exact, holding the rows whose confidence is "medium"
    host_broad: dict[str, tuple[str, str, str]]
    # normalized synonym -> normalized decision
    host_negative: dict[str, str]
    # normalized synonym -> rules registered under it
    controlled: dict[str, list[ControlledRule]]
    # (normalized source column, normalized synonym) -> rules
    controlled_source_specific: dict[tuple[str, str], list[ControlledRule]]
    # field name -> approved broad values
    approved_broad: dict[str, set[str]]
    # country name -> metadata mapping
    country_mapping: dict[str, dict[str, str]]
    # normalized source value -> country name
    geography_rules: dict[str, str]
175
+
176
+
177
@lru_cache(maxsize=1)
def load_rules() -> RuleStore:
    """Read the packaged rule files and build the lookup tables.

    The result is memoized by ``lru_cache``, so the files are read on the
    first call only. Rows missing a required cell are skipped silently.
    """
    # Host synonyms: rows with "medium" confidence feed the broad table,
    # everything else the exact table.
    exact_hosts: dict[str, tuple[str, str, str]] = {}
    broad_hosts: dict[str, tuple[str, str, str]] = {}
    for record in read_package_csv("host_synonyms.csv"):
        synonym = normalize_lookup(record.get("synonym"))
        canonical_name = (record.get("canonical") or "").strip()
        tax_id = (record.get("taxid") or "").strip()
        level = normalize_lookup(record.get("confidence") or "high")
        if synonym and canonical_name and tax_id:
            bucket = broad_hosts if level == "medium" else exact_hosts
            bucket[synonym] = (canonical_name, tax_id, level or "high")

    # Negative host rules. Decisions pass through normalize_lookup, so
    # underscores in the file arrive here as spaces.
    negative_hosts: dict[str, str] = {}
    for record in read_package_csv("host_negative_rules.csv"):
        synonym = normalize_lookup(record.get("synonym"))
        verdict = normalize_lookup(record.get("decision") or "non_host_source")
        if synonym:
            negative_hosts[synonym] = verdict

    # Controlled-vocabulary rules, indexed by synonym and, when the rule names
    # a source column, additionally by (source column, synonym).
    by_synonym: dict[str, list[ControlledRule]] = {}
    by_source_and_synonym: dict[tuple[str, str], list[ControlledRule]] = {}
    for record in read_package_csv("controlled_categories.csv"):
        if normalize_lookup(record.get("status") or "approved") not in {"approved", "active"}:
            continue
        synonym = normalize_lookup(
            record.get("synonym") or record.get("original_value") or record.get("normalized_value")
        )
        destination = (record.get("destination") or "").strip()
        proposed = (record.get("proposed_value") or record.get("category") or "").strip()
        if not (synonym and destination and proposed):
            continue
        rule = ControlledRule(
            synonym=synonym,
            source_column=normalize_lookup(record.get("source_column")),
            destination=destination,
            proposed_value=proposed,
            broad_value=(record.get("broad_value") or "").strip(),
            ontology_id=(record.get("ontology_id") or "").strip(),
            confidence=normalize_lookup(record.get("confidence") or "medium"),
            method=normalize_lookup(record.get("method") or "dictionary"),
        )
        by_synonym.setdefault(synonym, []).append(rule)
        if rule.source_column:
            by_source_and_synonym.setdefault((rule.source_column, synonym), []).append(rule)

    # Approved broad categories, grouped per field.
    approved: dict[str, set[str]] = {}
    for record in read_package_csv("approved_broad_categories.csv"):
        field_name = (record.get("field") or "").strip()
        approved_value = (record.get("approved_value") or "").strip()
        if field_name and approved_value:
            approved.setdefault(field_name, set()).add(approved_value)

    # Reviewed geography overrides; a later row replaces an earlier one that
    # normalizes to the same key.
    geography: dict[str, str] = {}
    for record in read_package_csv("geography_reviewed_rules.csv"):
        source_value = normalize_lookup(record.get("source_value"))
        country_name = (record.get("country") or "").strip()
        if source_value and country_name:
            geography[source_value] = country_name

    countries = read_package_json("country_mapping.json")
    return RuleStore(
        host_exact=exact_hosts,
        host_broad=broad_hosts,
        host_negative=negative_hosts,
        controlled=by_synonym,
        controlled_source_specific=by_source_and_synonym,
        approved_broad=approved,
        country_mapping=countries,
        geography_rules=geography,
    )
246
+
247
+
248
# Column order shared by every COMMON_LINEAGE entry.
_LINEAGE_FIELDS = (
    "Host_Rank",
    "Host_Superkingdom",
    "Host_Phylum",
    "Host_Class",
    "Host_Order",
    "Host_Family",
    "Host_Genus",
    "Host_Species",
    "Host_Common_Name",
)

# Built-in lineages keyed by TaxID string. taxonkit_lineage() answers from this
# table before it considers running the external tool.
COMMON_LINEAGE = {
    taxid: dict(zip(_LINEAGE_FIELDS, values))
    for taxid, values in (
        ("9606", ("species", "Eukaryota", "Chordata", "Mammalia", "Primates", "Hominidae", "Homo", "Homo sapiens", "human")),
        ("9913", ("species", "Eukaryota", "Chordata", "Mammalia", "Artiodactyla", "Bovidae", "Bos", "Bos taurus", "cattle")),
        ("9823", ("species", "Eukaryota", "Chordata", "Mammalia", "Artiodactyla", "Suidae", "Sus", "Sus scrofa", "pig")),
        ("9031", ("species", "Eukaryota", "Chordata", "Aves", "Galliformes", "Phasianidae", "Gallus", "Gallus gallus", "chicken")),
        ("8782", ("class", "Eukaryota", "Chordata", "Aves", "", "", "", "", "bird")),
        ("9615", ("subspecies", "Eukaryota", "Chordata", "Mammalia", "Carnivora", "Canidae", "Canis", "Canis lupus", "dog")),
        ("9685", ("species", "Eukaryota", "Chordata", "Mammalia", "Carnivora", "Felidae", "Felis", "Felis catus", "cat")),
    )
}
327
+
328
+
329
def empty_lineage() -> dict[str, str]:
    """Return a fresh dict with every lineage column set to an empty string."""
    columns = (
        "Host_Rank",
        "Host_Superkingdom",
        "Host_Phylum",
        "Host_Class",
        "Host_Order",
        "Host_Family",
        "Host_Genus",
        "Host_Species",
        "Host_Common_Name",
    )
    return dict.fromkeys(columns, "")
341
+
342
+
343
@lru_cache(maxsize=5000)
def taxonkit_lineage(taxid: str) -> dict[str, str]:
    """Return the lineage columns for *taxid*.

    Lookup order:

    1. ``COMMON_LINEAGE`` (a copy of the built-in entry).
    2. The external ``taxonkit`` executable, when *taxid* is all digits and
       the tool is on ``PATH``: ``taxonkit lineage -r`` piped into
       ``taxonkit reformat`` with a tab-separated seven-rank format.

    Any failure (tool missing, non-numeric taxid, process error, timeout,
    non-zero exit status) yields ``empty_lineage()``. ``Host_Common_Name`` is
    never filled from taxonkit output.

    NOTE: results are memoized by ``lru_cache``, so repeated calls with the
    same argument return the *same* dict object; callers should copy it
    before mutating.
    """
    taxid = str(taxid or "").strip()
    if taxid in COMMON_LINEAGE:
        return dict(COMMON_LINEAGE[taxid])
    if not taxid.isdigit() or shutil.which("taxonkit") is None:
        return empty_lineage()
    try:
        lineage = subprocess.run(
            ["taxonkit", "lineage", "-r"],
            input=f"{taxid}\n",
            text=True,
            capture_output=True,
            timeout=30,
            check=False,
        )
        reformatted = subprocess.run(
            ["taxonkit", "reformat", "-f", "{k}\t{p}\t{c}\t{o}\t{f}\t{g}\t{s}"],
            input=lineage.stdout,
            text=True,
            capture_output=True,
            timeout=30,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        return empty_lineage()
    if lineage.returncode != 0 or reformatted.returncode != 0:
        return empty_lineage()

    result = empty_lineage()

    # `taxonkit lineage -r` prints "taxid<TAB>lineage<TAB>rank".
    lineage_lines = lineage.stdout.splitlines()
    if lineage_lines:
        lineage_fields = lineage_lines[0].split("\t")
        if len(lineage_fields) >= 3:
            result["Host_Rank"] = lineage_fields[2].strip()

    # `taxonkit reformat` echoes the input columns and appends the formatted
    # ranks, so the seven requested ranks are the LAST seven tab-separated
    # fields of the line, not the first ones. Reading from the end also keeps
    # this correct if the number of echoed columns differs.
    rank_columns = (
        "Host_Superkingdom",
        "Host_Phylum",
        "Host_Class",
        "Host_Order",
        "Host_Family",
        "Host_Genus",
        "Host_Species",
    )
    reformatted_lines = reformatted.stdout.splitlines()
    if reformatted_lines:
        fields = reformatted_lines[0].split("\t")
        if len(fields) >= len(rank_columns):
            for column, text in zip(rank_columns, fields[-len(rank_columns):]):
                result[column] = text.strip()
    return result
387
+
388
+
389
def host_match(value: str, *, allow_substring: bool = True) -> tuple[str, str, str, str]:
    """Match a raw host value against the loaded host dictionaries.

    Returns ``(name, taxid, method, confidence)``. When nothing matches,
    ``name`` and ``taxid`` are empty, ``confidence`` is ``"none"`` and
    ``method`` is one of ``"missing"``, ``"not_identifiable"``,
    ``"non_host_source"`` or ``"review_needed"``.

    Checks run in this order: missing token, negative rule, food-product
    wording without an explicit host word, exact dictionary, broad dictionary
    and, only when *allow_substring* is true, whole-word containment of a
    dictionary key (longest keys first).
    """
    store = load_rules()
    lookup = normalize_lookup(value)
    if is_missing(lookup):
        return "", "", "missing", "none"

    verdict = store.host_negative.get(lookup)
    if verdict is not None:
        if verdict in {"missing", "absent"}:
            outcome = "missing"
        elif verdict in {"not_identifiable", "not identifiable"}:
            outcome = "not_identifiable"
        else:
            outcome = "non_host_source"
        return "", "", outcome, "none"

    raw_text = str(value)
    names_a_host = re.search(r"\b(human|patient|cattle|bovine|pig|swine|chicken)\b", raw_text, re.I)
    if FOOD_PRODUCT_RE.search(raw_text) and not names_a_host:
        return "", "", "non_host_source", "none"

    exact_hit = store.host_exact.get(lookup)
    if exact_hit is not None:
        name, taxid, confidence = exact_hit
        return name, taxid, "dictionary", confidence

    broad_hit = store.host_broad.get(lookup)
    if broad_hit is not None:
        name, taxid, confidence = broad_hit
        return name, taxid, "broad_dictionary", confidence or "medium"

    if not allow_substring:
        return "", "", "review_needed", "none"

    def longest_first(table):
        # Longer synonyms are tried first so the most specific key wins.
        return sorted(table.items(), key=lambda item: len(item[0]), reverse=True)

    # Exact-table keys are compared with dots removed on both sides.
    padded = f" {lookup.replace('.', '')} "
    for synonym, (name, taxid, confidence) in longest_first(store.host_exact):
        if len(synonym) < 3:
            continue
        if re.search(rf"(^|\s){re.escape(synonym.replace('.', ''))}(\s|$)", padded):
            return name, taxid, "context_dictionary", confidence

    for synonym, (name, taxid, confidence) in longest_first(store.host_broad):
        if len(synonym) < 4:
            continue
        if re.search(rf"(^|\s){re.escape(synonym)}(\s|$)", lookup):
            return name, taxid, "broad_dictionary", confidence or "medium"

    return "", "", "review_needed", "none"
422
+
423
+
424
def standardize_host(row: dict[str, Any]) -> dict[str, str]:
    """Build the ``Host_*`` output columns for one input row.

    The host column is matched first. If that yields no taxid, every source
    field in ``SOURCE_FIELDS`` is tried in order; the first one that both
    resolves to a taxid and mentions a sample material is accepted as a
    ``"context_recovery"`` match with ``"medium"`` confidence.
    """
    original = first_present(row, HOST_ALIASES)
    matched_text = original
    name, taxid, method, confidence = host_match(original, allow_substring=True)

    unresolved = {"missing", "non_host_source", "not_identifiable", "review_needed"}
    if not taxid and method in unresolved:
        for aliases in SOURCE_FIELDS.values():
            candidate = first_present(row, aliases)
            if not candidate or is_missing(candidate):
                continue
            candidate_name, candidate_taxid, _, _ = host_match(candidate, allow_substring=True)
            if candidate_taxid and SAMPLE_MATERIAL_RE.search(candidate):
                name, taxid = candidate_name, candidate_taxid
                method, confidence = "context_recovery", "medium"
                matched_text = candidate
                break

    cleaned = normalize_lookup(matched_text)
    review_status = "accepted" if taxid else method
    result = {
        "Host_Original": original,
        "Host_Cleaned": cleaned,
        "Host_SD": name,
        "Host_TaxID": taxid,
        "Host_Match_Method": method,
        "Host_Confidence": confidence,
        "Host_Review_Status": review_status,
        **empty_lineage(),
    }
    if taxid:
        result.update(taxonkit_lineage(taxid))
        # Without a common name from the lineage, fall back to the cleaned
        # input text unless it merely repeats the canonical name.
        if not result.get("Host_Common_Name") and name.lower() != cleaned:
            result["Host_Common_Name"] = cleaned
    return result
455
+
456
+
457
def compress_broad_value(field: str, value: str) -> str:
    """Collapse a broad-category value into one of a few coarse labels.

    Empty input yields ``""``. A value already approved for *field* is
    returned unchanged. Otherwise the normalized value is scanned for
    keywords in a fixed priority order; when none is found the trimmed value
    is returned as is.
    """
    value = str(value or "").strip()
    if not value:
        return ""
    if value in load_rules().approved_broad.get(field, set()):
        return value

    key = normalize_lookup(value)

    def mentions(*keywords: str) -> bool:
        return any(keyword in key for keyword in keywords)

    # Food categories outrank everything else.
    for keywords, label in (
        (("meat",), "food/meat"),
        (("dairy", "milk"), "food/dairy"),
        (("food",), "food"),
    ):
        if mentions(*keywords):
            return label

    # Healthcare wording splits on whether an environment is mentioned too.
    if mentions("healthcare", "hospital", "clinical"):
        if "environment" in key:
            return "healthcare-associated environment"
        return "clinical/host-associated material"

    for keywords, label in (
        (("culture", "laboratory"), "culture/laboratory"),
        (("water",), "water"),
        (("soil",), "soil"),
        (("sediment",), "sediment"),
        (("environment",), "environmental material"),
    ):
        if mentions(*keywords):
            return label
    return value
484
+
485
+
486
def apply_controlled_rules(row: dict[str, Any]) -> dict[str, str]:
    """Fill the controlled-vocabulary columns for one input row.

    The returned dict holds every standardized column except the geography
    and collection-year ones, all defaulting to ``""``. For each source field
    with a usable value, the matching rules are applied in order and the first
    rule to target a still-empty column wins. Skipped host-like sample types
    are recorded in ``FetchM2_Standardization_Notes``.
    """
    store = load_rules()
    handled_elsewhere = {"Country", "Continent", "Subcontinent", "Collection_Year"}
    output: dict[str, str] = {}
    for column in STANDARDIZED_COLUMNS:
        if column not in handled_elsewhere:
            output[column] = ""
    notes: list[str] = []

    host_like_values = {"human", "patient", "animal", "poultry", "cattle", "pig", "plant", "bacteria"}
    culture_values = {"pure culture", "bacterial culture", "bacteria culture", "single culture"}

    for source_label, aliases in SOURCE_FIELDS.items():
        raw_value = first_present(row, aliases)
        lookup = normalize_lookup(raw_value)
        if not lookup or is_missing(lookup):
            continue

        source_key = normalize_lookup(source_label)
        # Source-specific rules take precedence over the generic ones.
        candidates = store.controlled_source_specific.get((source_key, lookup)) or store.controlled.get(lookup, [])
        for rule in candidates:
            if rule.source_column and rule.source_column != source_key:
                continue
            destination = rule.destination
            proposed = rule.proposed_value
            looks_like_host = (
                destination == "Sample_Type_SD"
                and not SAMPLE_MATERIAL_RE.search(proposed)
                and normalize_lookup(proposed) in host_like_values
            )
            if looks_like_host:
                notes.append(f"skipped host-like sample type: {raw_value}")
                continue
            if destination not in output or output[destination]:
                continue
            output[destination] = proposed
            broad_column = f"{destination}_Broad"
            if broad_column in output and rule.broad_value:
                output[broad_column] = compress_broad_value(broad_column, rule.broad_value)

        # Culture-type sample values always override the sample type.
        if source_label == "Sample Type" and lookup in culture_values:
            output["Sample_Type_SD"] = "pure/single culture"
            if not output.get("Sample_Type_SD_Broad"):
                output["Sample_Type_SD_Broad"] = "culture/laboratory"

    output["FetchM2_Standardization_Notes"] = "; ".join(notes)
    return output
523
+
524
+
525
def standardize_collection_year(row: dict[str, Any]) -> str:
    """Return the first 19xx/20xx year in the row's date value, or ``""``.

    Only the first non-empty date column is inspected.
    """
    date_columns = ["Collection Date", "collection_date", "collection date", "Assembly Release Date"]
    found = DATE_YEAR_RE.search(first_present(row, date_columns))
    if found is None:
        return ""
    return found.group(0)
529
+
530
+
531
def standardize_geography(row: dict[str, Any]) -> dict[str, str]:
    """Derive ``Country``, ``Continent`` and ``Subcontinent`` for one row.

    All three are empty when the location is absent, is a missing token, or
    contains a false-context term. Otherwise a reviewed geography rule is
    preferred; failing that, the text before the first ``:`` is compared with
    the known country names and then with a short alias list.
    """
    store = load_rules()
    raw = first_present(row, ["Geographic Location", "geo_loc_name", "geographic location", "Country", "country"])
    if not raw or is_missing(raw) or COUNTRY_FALSE_CONTEXT.search(raw):
        return {"Country": "", "Continent": "", "Subcontinent": ""}

    lookup = normalize_lookup(raw)
    if lookup in store.geography_rules:
        country = store.geography_rules[lookup]
    else:
        head = normalize_lookup(raw.split(":", 1)[0].strip())
        country = next(
            (known for known in store.country_mapping if normalize_lookup(known) == head),
            "",
        )
        if not country:
            country_aliases = {
                "usa": "United States",
                "us": "United States",
                "u s a": "United States",
                "united states of america": "United States",
                "uk": "United Kingdom",
                "u k": "United Kingdom",
                "england": "United Kingdom",
                "south korea": "South Korea",
                "republic of korea": "South Korea",
            }
            country = country_aliases.get(head, "")

    details = store.country_mapping.get(country, {})
    return {
        "Country": country,
        "Continent": details.get("Continent", ""),
        "Subcontinent": details.get("Subcontinent", ""),
    }
568
+
569
+
570
def standardize_row(row: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *row* extended with every standardized column.

    Every name in ``STANDARDIZED_COLUMNS`` is present in the result. Values
    come from the controlled rules, the host match, the geography lookup and
    the collection year, applied in that order; a later step overwrites any
    key it shares with an earlier one. An input column with the same name as
    a standardized column is overwritten. *row* itself is not modified.
    """
    # The previous version ended with a branch that re-ran host_match() and
    # then did nothing with the result; it was removed because it had no
    # effect on the output.
    standardized = dict.fromkeys(STANDARDIZED_COLUMNS, "")
    standardized.update(apply_controlled_rules(row))
    standardized.update(standardize_host(row))
    standardized.update(standardize_geography(row))
    standardized["Collection_Year"] = standardize_collection_year(row)

    output = dict(row)
    output.update(standardized)
    return output
583
+
584
+
585
def standardize_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Standardize each row with standardize_row(), preserving input order."""
    results: list[dict[str, Any]] = []
    for row in rows:
        results.append(standardize_row(row))
    return results
fetchm2/utils.py ADDED
@@ -0,0 +1,54 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import json
5
+ from importlib import resources
6
+ from pathlib import Path
7
+ from typing import Any, Iterable
8
+
9
+
10
def data_path(filename: str) -> Path:
    """Return the path of *filename* inside the ``fetchm2.data`` package."""
    resource = resources.files("fetchm2.data").joinpath(filename)
    # The traversable is stringified and wrapped in a plain Path.
    return Path(str(resource))
12
+
13
+
14
def read_package_csv(filename: str) -> list[dict[str, str]]:
    """Load a packaged CSV file as a list of plain dicts, one per data row.

    The file is decoded as ``utf-8-sig``, so a leading byte-order mark does
    not end up in the first header name.
    """
    with data_path(filename).open(newline="", encoding="utf-8-sig") as handle:
        reader = csv.DictReader(handle)
        return list(map(dict, reader))
18
+
19
+
20
def read_package_json(filename: str) -> Any:
    """Parse a packaged UTF-8 JSON file and return the decoded object."""
    raw_text = data_path(filename).read_text(encoding="utf-8")
    return json.loads(raw_text)
22
+
23
+
24
def first_present(row: dict[str, Any], names: Iterable[str]) -> str:
    """Return the first non-empty value found under any of *names*.

    Column names are compared after trimming and lower-casing. *names* is
    tried in order; for each name every matching column is inspected, so a
    blank ``host`` column no longer hides a populated ``Host`` column. When
    several columns collapse to the same name, the one that appears last in
    *row* is tried first, which matches the column the earlier
    single-key lookup would have picked.

    Args:
        row: Mapping of column name to cell value.
        names: Candidate column names, highest priority first.

    Returns:
        The trimmed string form of the first non-empty value, or ``""`` when
        no candidate column has one.
    """
    keys_by_name: dict[str, list[Any]] = {}
    for key in row:
        if key is None:
            # csv.DictReader stores overflow cells under a None key; such a
            # key never names a real column.
            continue
        keys_by_name.setdefault(str(key).strip().lower(), []).append(key)

    for name in names:
        for key in reversed(keys_by_name.get(name.strip().lower(), ())):
            value = row.get(key)
            text = "" if value is None else str(value).strip()
            if text:
                return text
    return ""
35
+
36
+
37
def write_csv(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write *rows* to *path* as UTF-8 CSV, creating parent directories.

    The header is the union of all row keys in first-seen order; keys missing
    from a row are written as empty cells.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
50
+
51
+
52
def write_text(path: Path, text: str) -> None:
    """Write *text* to *path* as UTF-8, creating parent directories first."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    path.write_text(text, encoding="utf-8")