@ca-plant-list/ca-plant-list 0.4.14 → 0.4.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,440 @@
1
+ import { scrape } from "@htmltools/scrape";
2
+ import { Files } from "../files.js";
3
+
4
+ /**
5
+ * @typedef {{id:string,name:string,common?:string,type:string,under:string}} JepsonInfo
6
+ */
7
+
8
/**
 * Entry types recognized in the Jepson eFlora name index.
 *
 * The index parser in this file reads the type text of each index row and
 * rejects any value that is not listed here, so a new type appearing in the
 * eFlora index must be added to this table before the index can be parsed.
 */
const TYPES = {
    EX_ALIEN: "Extirpated alien",
    HYBRID_SPONT: "Spontaneous hybrid",
    ILLEGITIMATE: "Illegitimate name",
    INVALID: "Invalid name",
    INVALID_NOTED: "Noted name", // "auct. non" = misapplied name
    INVALID_SUPERFLUOUS: "Superfluous name",
    MENTIONED: "Mentioned in a note",
    MISAPPLIED: "Misapplied name",
    MISAPP_PART: "Misapplied name, in part",
    MISAPP_UNABRIDGED: "Unabridged misapplied name",
    MISAPP_UNABRIDGED_PART: "Unabridged misapplied name, in part",
    NATIVE: "Native",
    NATIVITY_UNCERTAIN: "Native or naturalized",
    NATURALIZED: "Naturalized",
    POSSIBLY_IN_CA: "Possibly in ca",
    SYNONYM: "Synonym",
    SYN_INED: "Synonym ined.", // nomen ineditum: unpublished name; name not published or not validly published
    SYN_ORTH_VARIANT: "Orthographic variant",
    SYN_PART: "Synonym, in part",
    SYN_PART_UN: "Unabridged synonym, in part",
    WAIF: "Waif",
    WAIF_EX: "Extirpated waif",
    WAIF_HIST: "Historical waif",
    // Placeholder value: the index parser substitutes this for any type text containing " weed".
    WEED: "* weed*",
};
// Frozen in a separate statement so the table cannot be modified at runtime
// while the inferred property types stay plain strings.
Object.freeze(TYPES);
34
+
35
/**
 * Cross-checks the local taxon list against the Jepson eFlora name index.
 *
 * Index pages are kept as HTML files under `<toolsDataDir>/jepson-eflora`
 * (downloaded only when the file is missing) and are loaded lazily, one page
 * per first letter of a requested name. All findings are reported through
 * the supplied error log.
 */
class JepsonEFlora {
    #toolsDataPath;
    #taxa;
    #errorLog;
    #shouldLogNotes;

    /**
     * Index entries that were kept by the parser, keyed by scientific name.
     * @type {Map<string,JepsonInfo>}
     */
    #nameInfo = new Map();
    /**
     * First letters whose index page has already been parsed.
     * @type {Set<string>}
     */
    #loadedLetters = new Set();

    /**
     * @param {string} toolsDataDir
     * @param {Taxa} taxa
     * @param {ErrorLog} errorLog
     * @param {boolean} shouldLogNotes
     */
    constructor(toolsDataDir, taxa, errorLog, shouldLogNotes) {
        this.#toolsDataPath = `${toolsDataDir}/jepson-eflora`;
        this.#taxa = taxa;
        this.#errorLog = errorLog;
        this.#shouldLogNotes = shouldLogNotes;
    }

    /**
     * Compares every taxon in the list with its eFlora index entry (presence,
     * Jepson ID, synonym status, nativity status), then checks synonyms in
     * both directions and validates the configured Jepson exceptions.
     * @param {import("../exceptions.js").Exceptions} exceptions
     */
    async analyze(exceptions) {
        // Create data directory if it's not there.
        Files.mkdir(this.#toolsDataPath);

        for (const taxon of this.#taxa.getTaxonList()) {
            const name = taxon.getName();
            if (name.includes(" unknown")) {
                continue;
            }

            const jepsInfo = await this.#getJepsInfo(name);
            if (jepsInfo === undefined) {
                // Not found in the index.
                if (!exceptions.hasException(name, "jepson", "notineflora")) {
                    this.#errorLog.log(name, "not found in eFlora index");
                }
                continue;
            }

            if (taxon.getJepsonID() !== jepsInfo.id) {
                this.#errorLog.log(
                    name,
                    "Jepson ID does not match ID from eFlora index",
                    taxon.getJepsonID(),
                    jepsInfo.id,
                );
            }

            if (
                this.#isSynonym(jepsInfo) &&
                !exceptions.hasException(name, "jepson", "allowsynonym")
            ) {
                this.#errorLog.log(name, "is synonym for", jepsInfo.under);
            }

            const efStatus = await this.#getStatusCode(jepsInfo);
            const taxonStatus = taxon.getStatus();
            // A local status of "NC" is accepted when eFlora reports "N".
            const isAcceptedMismatch =
                taxonStatus === "NC" && efStatus === "N";
            if (efStatus !== taxonStatus && !isAcceptedMismatch) {
                this.#errorLog.log(
                    name,
                    "Jepson eFlora index has different nativity status than taxa.csv",
                    JSON.stringify(efStatus),
                    taxonStatus,
                );
            }
        }

        await this.#checkSynonyms();
        this.#checkExceptions(exceptions);
    }

    /**
     * Checks the Jepson exceptions and makes sure they still apply.
     * Relies on index entries already loaded into the name map.
     * @param {import("../exceptions.js").Exceptions} exceptions
     */
    #checkExceptions(exceptions) {
        for (const [name, v] of exceptions.getExceptions()) {
            const jepsonExceptions = v.jepson;
            if (!jepsonExceptions) {
                continue;
            }

            // Make sure the taxon is still in our list.
            const taxon = this.#taxa.getTaxon(name);
            if (!taxon) {
                // Don't process global exceptions if taxon is not in local list.
                if (this.#taxa.isSubset() && !v.local) {
                    continue;
                }
                this.#errorLog.log(
                    name,
                    "has Jepson exceptions but is not in taxa.csv",
                );
                continue;
            }

            const jepsonData = this.#nameInfo.get(name);
            for (const k of Object.keys(jepsonExceptions)) {
                switch (k) {
                    case "allowsynonym":
                        // Make sure it really is a synonym.
                        if (!this.#isSynonym(jepsonData)) {
                            this.#errorLog.log(
                                name,
                                "has Jepson allowsynonym exception but is not a synonym",
                            );
                        }
                        break;
                    case "notineflora":
                        // Make sure it is really not in eFlora.
                        if (jepsonData) {
                            this.#errorLog.log(
                                name,
                                "has Jepson notineflora exception but is in eFlora",
                            );
                        }
                        break;
                    default:
                        this.#errorLog.log(
                            name,
                            "unrecognized Jepson exception",
                            k,
                        );
                }
            }
        }
    }

    /**
     * Compares synonyms in both directions: eFlora synonyms whose target is
     * in the taxon list must be listed as synonyms of that taxon, and every
     * listed synonym must be a synonym in eFlora.
     */
    async #checkSynonyms() {
        // Make sure all synonyms in eFlora are in our list.
        // #nameInfo is a Map, so its entries must be read with values();
        // Object.values() on a Map yields an empty array.
        for (const jepsonInfo of this.#nameInfo.values()) {
            if (!this.#isSynonym(jepsonInfo)) {
                continue;
            }

            const target = jepsonInfo.under;
            const taxon = this.#taxa.getTaxon(target);
            if (!taxon) {
                // We're not tracking the target.
                continue;
            }

            if (taxon.getSynonyms().includes(jepsonInfo.name)) {
                // Already have it.
                continue;
            }

            this.#errorLog.log(
                target,
                "does not have synonym",
                `${jepsonInfo.name},${target}`,
            );
        }

        // Make sure everything in our list is in eFlora.
        for (const taxon of this.#taxa.getTaxonList()) {
            for (const synonym of taxon.getSynonyms()) {
                const jepsonInfo = await this.#getJepsInfo(synonym);
                if (jepsonInfo && this.#isSynonym(jepsonInfo)) {
                    continue;
                }
                // Ignore iNat synonyms.
                if (synonym !== taxon.getINatSyn()) {
                    this.#errorLog.log(
                        synonym,
                        "is in synonyms.csv but is not a synonym in eFlora",
                    );
                }
            }
        }
    }

    /**
     * Returns the index entry for a name, first loading the index page for
     * the name's first letter if that page has not been loaded yet.
     * @param {string} name
     * @returns {Promise<JepsonInfo|undefined>}
     */
    async #getJepsInfo(name) {
        if (!name) {
            // Nothing to look up; avoids requesting an index page for "undefined".
            return undefined;
        }
        const firstLetter = name[0];
        // See if this index has been loaded.
        if (!this.#loadedLetters.has(firstLetter)) {
            await this.#loadNameIndex(firstLetter);
        }

        return this.#nameInfo.get(name);
    }

    /**
     * Maps an index entry to a status code: "N" for native, "U" for native
     * or naturalized, "X" for every other type. For a synonym, the status of
     * its target entry is returned, or undefined if the target has no entry.
     * @param {JepsonInfo} jepsInfo
     * @returns {Promise<StatusCode|undefined>}
     */
    async #getStatusCode(jepsInfo) {
        // If it's a synonym, return status of the target.
        if (this.#isSynonym(jepsInfo)) {
            // Look the target up through #getJepsInfo so that its index page
            // is loaded on demand; otherwise the result would depend on
            // whether a name with the same first letter was requested earlier.
            const targetInfo = await this.#getJepsInfo(jepsInfo.under);
            if (!targetInfo) {
                return undefined;
            }
            return this.#getStatusCode(targetInfo);
        }
        switch (jepsInfo.type) {
            case TYPES.NATIVE:
                return "N";
            case TYPES.NATIVITY_UNCERTAIN:
                return "U";
            default:
                return "X";
        }
    }

    /**
     * @param {JepsonInfo|undefined} jepsInfo
     * @returns {boolean} true only for an entry whose type is TYPES.SYNONYM
     */
    #isSynonym(jepsInfo) {
        return jepsInfo?.type === TYPES.SYNONYM;
    }

    /**
     * Parses the eFlora index page for one letter, downloading it into the
     * data directory first if the file is not already there.
     * @param {string} firstLetter
     */
    async #loadNameIndex(firstLetter) {
        const filePath = `${this.#toolsDataPath}/index_${firstLetter}.html`;

        // Retrieve file if it's not there.
        if (!Files.exists(filePath)) {
            const url =
                "https://ucjeps.berkeley.edu/eflora/eflora_index.php?index=" +
                encodeURIComponent(firstLetter);
            console.log("retrieving " + filePath);
            await Files.fetch(url, filePath);
        }

        const document = scrape.parseFile(filePath);
        this.#parseIndex(document);

        // Marked as loaded only after a successful parse.
        this.#loadedLetters.add(firstLetter);
    }

    /**
     * Logs a non-synonym "note" entry against whichever of its source name
     * and target name are in the taxon list.
     * @param {JepsonInfo} taxonData
     */
    #logNotes(taxonData) {
        // If we're tracking the source, log it.
        if (this.#taxa.getTaxon(taxonData.name)) {
            this.#errorLog.log(
                taxonData.name,
                "has eFlora note (as source)",
                taxonData.type + " for",
                taxonData.under,
            );
        }
        // If we're tracking the target, log it.
        if (this.#taxa.getTaxon(taxonData.under)) {
            this.#errorLog.log(
                taxonData.under,
                "has eFlora note (as target)",
                taxonData.type + " for",
                taxonData.name,
            );
        }
    }

    /**
     * Extracts entries from a parsed index page and stores the relevant ones
     * in the name map.
     * @param {import("@htmltools/scrape").Root} docTree
     * @throws {Error} if the index table is missing, a row has an
     *   unrecognized type, or a name link has no href
     */
    #parseIndex(docTree) {
        const validTypes = new Set(Object.values(TYPES));
        const reUnder = /\(Under (.*)\)/;

        const contentDiv = scrape.getSubtree(
            docTree,
            (t) => scrape.getAttr(t, "class") === "eFloraTable",
        );
        if (!contentDiv) {
            throw new Error("eFloraTable element not found in eFlora index");
        }
        const rows = scrape.getSubtrees(contentDiv, (t) => t.tagName === "tr");

        for (const row of rows) {
            const cols = scrape.getSubtrees(row, (t) => t.tagName === "td");
            if (!cols || cols.length < 3) {
                continue;
            }

            const links = scrape.getSubtrees(cols[0], (t) => t.tagName === "a");
            // Should be at least one link for a species row.
            if (!links || links.length === 0) {
                continue;
            }

            let type = scrape.getTextContent(cols[2]);
            if (!type) {
                // Some species are lacking a type; if it's one we're tracking, errors will show elsewhere, so ignore for now.
                continue;
            }

            const linkText = scrape.getTextContent(links[0]);
            if (!linkText.includes(" ")) {
                // It's a genus name, ignore it.
                continue;
            }

            if (type.includes(" weed")) {
                type = TYPES.WEED;
            }
            if (!validTypes.has(type)) {
                throw new Error(
                    "unrecognized type for " + linkText + ": " + type,
                );
            }

            const under = scrape.getTextContent(cols[0]);
            const common = scrape.getTextContent(cols[1]);

            const href = scrape.getAttr(links[0], "href");
            if (!href) {
                throw new Error(
                    "missing href on eFlora index link for " + linkText,
                );
            }

            /** @type {JepsonInfo} */
            const taxonData = {};

            taxonData.name = linkText;
            // The ID is the text after the first "=" in the link target.
            taxonData.id = href.split("=")[1];
            taxonData.type = type;
            // Only record a common name when the row actually has one.
            if (common) {
                taxonData.common = common;
            }

            // The first column may carry a "(Under <name>)" suffix naming the target entry.
            const m = under ? under.match(reUnder) : null;
            if (m) {
                taxonData.under = m[1];
            }

            // If we're not tracking either the source or target, ignore this entry.
            if (
                !this.#taxa.getTaxon(taxonData.name) &&
                !this.#taxa.getTaxon(taxonData.under) &&
                !this.#taxa.hasSynonym(taxonData.name)
            ) {
                continue;
            }

            // NOTE(review): TYPES.MISAPP_UNABRIDGED_PART is not in this list, so
            // such rows are stored as regular entries - confirm that is intended.
            switch (type) {
                case TYPES.ILLEGITIMATE:
                case TYPES.INVALID:
                case TYPES.INVALID_NOTED:
                case TYPES.INVALID_SUPERFLUOUS:
                case TYPES.MISAPPLIED:
                case TYPES.MISAPP_PART:
                case TYPES.MISAPP_UNABRIDGED:
                case TYPES.SYN_INED:
                case TYPES.SYN_ORTH_VARIANT:
                case TYPES.SYN_PART:
                case TYPES.SYN_PART_UN:
                case TYPES.MENTIONED:
                    // Not a valid synonym or active taxon. Log it for further investigation.
                    if (this.#shouldLogNotes) {
                        this.#logNotes(taxonData);
                    }
                    continue;
            }

            if (this.#nameInfo.get(taxonData.name)) {
                this.#errorLog.log(
                    taxonData.name,
                    "has multiple entries in eFlora",
                );
                // Disable the current entry, since we don't know which one is correct.
                this.#nameInfo.delete(taxonData.name);
                continue;
            }
            this.#nameInfo.set(taxonData.name, taxonData);
        }
    }
}
439
+
440
+ export { JepsonEFlora };