edges 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of edges might be problematic; see the registry's release page for more details.

Files changed (66)
  1. edges/__init__.py +9 -2
  2. edges/data/AWARE 2.0_Country_all_yearly.json +8 -1
  3. edges/data/AWARE 2.0_Country_irri_yearly.json +8 -1
  4. edges/data/AWARE 2.0_Country_non_irri_yearly.json +8 -1
  5. edges/data/AWARE 2.0_Country_unspecified_yearly.json +8 -1
  6. edges/data/GeoPolRisk_paired_2024.json +7 -0
  7. edges/data/ImpactWorld+ 2.1_Freshwater acidification_damage.json +8 -1
  8. edges/data/ImpactWorld+ 2.1_Freshwater acidification_midpoint.json +8 -1
  9. edges/data/ImpactWorld+ 2.1_Freshwater ecotoxicity, long term_damage.json +8 -1
  10. edges/data/ImpactWorld+ 2.1_Freshwater ecotoxicity, short term_damage.json +8 -1
  11. edges/data/ImpactWorld+ 2.1_Freshwater ecotoxicity_midpoint.json +8 -1
  12. edges/data/ImpactWorld+ 2.1_Freshwater eutrophication_damage.json +8 -1
  13. edges/data/ImpactWorld+ 2.1_Freshwater eutrophication_midpoint.json +8 -1
  14. edges/data/ImpactWorld+ 2.1_Land occupation, biodiversity_damage.json +8 -1
  15. edges/data/ImpactWorld+ 2.1_Land occupation, biodiversity_midpoint.json +8 -1
  16. edges/data/ImpactWorld+ 2.1_Land transformation, biodiversity_damage.json +8 -1
  17. edges/data/ImpactWorld+ 2.1_Land transformation, biodiversity_midpoint.json +8 -1
  18. edges/data/ImpactWorld+ 2.1_Marine ecotoxicity, long term_damage.json +8 -1
  19. edges/data/ImpactWorld+ 2.1_Marine ecotoxicity, short term_damage.json +8 -1
  20. edges/data/ImpactWorld+ 2.1_Marine eutrophication_damage.json +8 -1
  21. edges/data/ImpactWorld+ 2.1_Marine eutrophication_midpoint.json +8 -1
  22. edges/data/ImpactWorld+ 2.1_Particulate matter formation_damage.json +8 -1
  23. edges/data/ImpactWorld+ 2.1_Particulate matter formation_midpoint.json +8 -1
  24. edges/data/ImpactWorld+ 2.1_Photochemical ozone formation, ecosystem quality_damage.json +8 -1
  25. edges/data/ImpactWorld+ 2.1_Photochemical ozone formation, human health_damage.json +8 -1
  26. edges/data/ImpactWorld+ 2.1_Photochemical ozone formation_midpoint.json +8 -1
  27. edges/data/ImpactWorld+ 2.1_Terrestrial acidification_damage.json +8 -1
  28. edges/data/ImpactWorld+ 2.1_Terrestrial acidification_midpoint.json +8 -1
  29. edges/data/ImpactWorld+ 2.1_Terrestrial ecotoxicity, long term_damage.json +8 -1
  30. edges/data/ImpactWorld+ 2.1_Terrestrial ecotoxicity, short term_damage.json +8 -1
  31. edges/data/ImpactWorld+ 2.1_Thermally polluted water_damage.json +8 -1
  32. edges/data/ImpactWorld+ 2.1_Water availability, freshwater ecosystem_damage.json +8 -1
  33. edges/data/ImpactWorld+ 2.1_Water availability, human health_damage.json +8 -1
  34. edges/data/ImpactWorld+ 2.1_Water availability, terrestrial ecosystem_damage.json +8 -1
  35. edges/data/ImpactWorld+ 2.1_Water scarcity_midpoint.json +8 -1
  36. edges/data/LCC 1.0_2023.json +8 -1
  37. edges/data/RELICS_copper_primary.json +44 -0
  38. edges/data/RELICS_copper_secondary.json +42 -0
  39. edges/data/SCP_1.0.json +4 -1
  40. edges/edgelcia.py +2113 -816
  41. edges/flow_matching.py +344 -130
  42. edges/georesolver.py +61 -2
  43. edges/supply_chain.py +2052 -0
  44. edges/uncertainty.py +37 -8
  45. {edges-1.0.2.dist-info → edges-1.0.3.dist-info}/METADATA +5 -2
  46. edges-1.0.3.dist-info/RECORD +57 -0
  47. edges/data/GeoPolRisk_elementary flows_2024.json +0 -877
  48. edges/data/ImpactWorld+ 2.1_Freshwater ecotoxicity, long term_midpoint.json +0 -5
  49. edges/data/ImpactWorld+ 2.1_Freshwater ecotoxicity, short term_midpoint.json +0 -5
  50. edges/data/ImpactWorld+ 2.1_Freshwater ecotoxicity_damage.json +0 -0
  51. edges/data/ImpactWorld+ 2.1_Marine ecotoxicity, long term_midpoint.json +0 -5
  52. edges/data/ImpactWorld+ 2.1_Marine ecotoxicity, short term_midpoint.json +0 -5
  53. edges/data/ImpactWorld+ 2.1_Photochemical ozone formation, ecosystem quality_midpoint.json +0 -5
  54. edges/data/ImpactWorld+ 2.1_Photochemical ozone formation, human health_midpoint.json +0 -5
  55. edges/data/ImpactWorld+ 2.1_Photochemical ozone formation_damage.json +0 -5
  56. edges/data/ImpactWorld+ 2.1_Terrestrial ecotoxicity, long term_midpoint.json +0 -5
  57. edges/data/ImpactWorld+ 2.1_Terrestrial ecotoxicity, short term_midpoint.json +0 -5
  58. edges/data/ImpactWorld+ 2.1_Thermally polluted water_midpoint.json +0 -5
  59. edges/data/ImpactWorld+ 2.1_Water availability, freshwater ecosystem_midpoint.json +0 -5
  60. edges/data/ImpactWorld+ 2.1_Water availability, human health_midpoint.json +0 -5
  61. edges/data/ImpactWorld+ 2.1_Water availability, terrestrial ecosystem_midpoint.json +0 -5
  62. edges/data/ImpactWorld+ 2.1_Water scarcity_damage.json +0 -5
  63. edges/data/RELICS_copper.json +0 -22
  64. edges-1.0.2.dist-info/RECORD +0 -71
  65. {edges-1.0.2.dist-info → edges-1.0.3.dist-info}/WHEEL +0 -0
  66. {edges-1.0.2.dist-info → edges-1.0.3.dist-info}/top_level.txt +0 -0
edges/flow_matching.py CHANGED
@@ -5,7 +5,7 @@ from copy import deepcopy
5
5
  import json, time
6
6
  from typing import NamedTuple, List, Optional
7
7
 
8
- from .utils import make_hashable, _short_cf, _head
8
+ from edges.utils import make_hashable, _short_cf, _head
9
9
 
10
10
 
11
11
  import logging
@@ -56,6 +56,14 @@ def process_cf_list(
56
56
  filtered_supplier: dict,
57
57
  filtered_consumer: dict,
58
58
  ) -> list:
59
+ """
60
+ Select the best-matching CF from a candidate list given supplier/consumer filters.
61
+
62
+ :param cf_list: List of candidate CF dictionaries.
63
+ :param filtered_supplier: Supplier-side fields to match against.
64
+ :param filtered_consumer: Consumer-side fields to match against.
65
+ :return: List with the single best CF (or empty if none matched).
66
+ """
59
67
  results = []
60
68
  best_score = -1
61
69
  best_cf = None
@@ -69,7 +77,7 @@ def process_cf_list(
69
77
  criteria=supplier_cf,
70
78
  )
71
79
 
72
- if supplier_match is False:
80
+ if not supplier_match:
73
81
  continue
74
82
 
75
83
  consumer_match = match_flow(
@@ -77,7 +85,7 @@ def process_cf_list(
77
85
  criteria=consumer_cf,
78
86
  )
79
87
 
80
- if consumer_match is False:
88
+ if not consumer_match:
81
89
  continue
82
90
 
83
91
  match_score = 0
@@ -98,9 +106,10 @@ def process_cf_list(
98
106
  if match_score > best_score:
99
107
  best_score = match_score
100
108
  best_cf = cf
109
+ if best_score == 2:
110
+ break
101
111
 
102
112
  if best_cf:
103
- logger.debug("Best matching CF selected with score %d: %s", best_score, best_cf)
104
113
  results.append(best_cf)
105
114
  else:
106
115
  logger.debug(
@@ -113,7 +122,13 @@ def process_cf_list(
113
122
 
114
123
 
115
124
  def matches_classifications(cf_classifications, dataset_classifications):
116
- """Match CF classification codes to dataset classifications."""
125
+ """
126
+ Check if CF classification codes match dataset classifications (prefix logic).
127
+
128
+ :param cf_classifications: CF-side classifications (dict or list/tuple).
129
+ :param dataset_classifications: Dataset classifications as list/tuple pairs.
130
+ :return: True if at least one scheme/code pair matches by prefix, else False.
131
+ """
117
132
 
118
133
  if isinstance(cf_classifications, dict):
119
134
  cf_classifications = [
@@ -148,6 +163,14 @@ def matches_classifications(cf_classifications, dataset_classifications):
148
163
 
149
164
 
150
165
  def match_flow(flow: dict, criteria: dict) -> bool:
166
+ """
167
+ Match a flow dictionary against criteria with operator and exclude support.
168
+
169
+ :param flow: Flow metadata to test.
170
+ :param criteria: Matching criteria (fields, operator, excludes, classifications).
171
+ :return: True if all non-special fields match, else False.
172
+ """
173
+
151
174
  operator = criteria.get("operator", "equals")
152
175
  excludes = criteria.get("excludes", [])
153
176
 
@@ -215,7 +238,12 @@ def match_operator(value: str, target: str, operator: str) -> bool:
215
238
 
216
239
 
217
240
  def normalize_classification_entries(cf_list: list[dict]) -> list[dict]:
241
+ """
242
+ Normalize supplier-side 'classifications' to a flat tuple of (scheme, code).
218
243
 
244
+ :param cf_list: List of CF dictionaries to normalize in-place.
245
+ :return: The same list with normalized supplier classifications.
246
+ """
219
247
  for cf in cf_list:
220
248
  supplier = cf.get("supplier", {})
221
249
  classifications = supplier.get("classifications")
@@ -244,8 +272,10 @@ def normalize_classification_entries(cf_list: list[dict]) -> list[dict]:
244
272
 
245
273
  def build_cf_index(raw_cfs: list[dict]) -> dict:
246
274
  """
247
- Build a nested CF index:
248
- cf_index[(supplier_loc, consumer_loc)] → list of CFs
275
+ Build a CF index keyed by (supplier_location, consumer_location).
276
+
277
+ :param raw_cfs: List of CF dictionaries.
278
+ :return: Dict mapping (supplier_loc, consumer_loc) -> list of CFs.
249
279
  """
250
280
  index = defaultdict(list)
251
281
 
@@ -262,6 +292,7 @@ def build_cf_index(raw_cfs: list[dict]) -> dict:
262
292
  def cached_match_with_index(flow_to_match_hashable, required_fields_tuple):
263
293
  flow_to_match = dict(flow_to_match_hashable)
264
294
  required_fields = set(required_fields_tuple)
295
+ # the contexts live on the function as attributes
265
296
  return match_with_index(
266
297
  flow_to_match,
267
298
  cached_match_with_index.index,
@@ -273,9 +304,11 @@ def cached_match_with_index(flow_to_match_hashable, required_fields_tuple):
273
304
 
274
305
  def preprocess_flows(flows_list: list, mandatory_fields: set) -> dict:
275
306
  """
276
- Preprocess flows into a lookup dictionary.
277
- Each flow is keyed by a tuple of selected metadata fields.
278
- If no fields are present, falls back to a single universal key ().
307
+ Preprocess flows into a lookup dict keyed by selected metadata fields.
308
+
309
+ :param flows_list: Iterable of flow dicts with at least a 'position' key.
310
+ :param mandatory_fields: Set of fields to include in the key (may be empty).
311
+ :return: Dict where key is a tuple of (field, value) and value is list of positions.
279
312
  """
280
313
  lookup = {}
281
314
 
@@ -328,6 +361,12 @@ def build_index(lookup: dict, required_fields: set) -> dict:
328
361
 
329
362
 
330
363
  class MatchResult(NamedTuple):
364
+ """Result container for indexed matching.
365
+
366
+ :var matches: List of matched positions.
367
+ :var location_only_rejects: Map of position -> reason ("location").
368
+ """
369
+
331
370
  matches: List[int]
332
371
  location_only_rejects: dict[int, str]
333
372
 
@@ -339,6 +378,9 @@ def match_with_index(
339
378
  required_fields: set,
340
379
  reversed_lookup: dict,
341
380
  ) -> MatchResult:
381
+ """
382
+ Match a flow to positions using a per-field inverted index and full criteria.
383
+ """
342
384
  SPECIAL = {"excludes", "operator", "matrix"}
343
385
  nonloc_fields = [f for f in required_fields if f not in SPECIAL and f != "location"]
344
386
  has_location_constraint = ("location" in required_fields) and (
@@ -382,14 +424,46 @@ def match_with_index(
382
424
  if not keys:
383
425
  return []
384
426
  out = []
427
+ # Fast path: no excludes -> everything in these keys already matches
428
+ excludes = ft_for_matchflow.get("excludes")
429
+ if not excludes:
430
+ for key in keys:
431
+ # lookup_mapping[key] is the list of positions for this composite key
432
+ bucket = lookup_mapping.get(key)
433
+ if bucket:
434
+ out.extend(bucket)
435
+ return out
436
+
437
+ # Slow path: excludes present -> filter per-record once
438
+ # Normalize excludes for faster checks
439
+ ex = tuple(e.lower() for e in (excludes or ()))
385
440
  for key in keys:
386
- for pos in lookup_mapping.get(key, []):
441
+ bucket = lookup_mapping.get(key)
442
+ if not bucket:
443
+ continue
444
+ for pos in bucket:
387
445
  raw = reversed_lookup[pos]
388
446
  flow = dict(raw) if isinstance(raw, tuple) else raw
389
- if flow and match_flow(flow, ft_for_matchflow):
390
- out.append(pos)
447
+ # Only scan string fields; short-circuit early
448
+ if any(
449
+ isinstance(v, str) and any(e in v.lower() for e in ex)
450
+ for v in flow.values()
451
+ ):
452
+ continue
453
+ out.append(pos)
391
454
  return out
392
455
 
456
+ def intersect_smallest_first(sets_iterable):
457
+ sets_list = [s for s in sets_iterable if s is not None]
458
+ if not sets_list:
459
+ return set()
460
+ acc = min(sets_list, key=len).copy()
461
+ for s in sorted((x for x in sets_list if x is not acc), key=len):
462
+ acc &= s
463
+ if not acc:
464
+ break
465
+ return acc
466
+
393
467
  # --- SPECIAL CASE: only 'location' is required ---
394
468
  if not nonloc_fields and has_location_constraint:
395
469
  all_keys = set(lookup_mapping.keys())
@@ -413,22 +487,28 @@ def match_with_index(
413
487
 
414
488
  # --- NORMAL PATH: there are non-location required fields ---
415
489
  if nonloc_fields:
416
- pre_location_keys = None
490
+ # Build candidate key sets per non-location field
491
+ per_field_sets = []
417
492
  for field in nonloc_fields:
418
493
  cand = field_candidates(field, flow_to_match.get(field), op)
419
- pre_location_keys = (
420
- cand if pre_location_keys is None else (pre_location_keys & cand)
421
- )
422
- if not pre_location_keys:
494
+ if not cand:
495
+ # Any empty set means no matches possible
423
496
  return MatchResult(matches=[], location_only_rejects={})
497
+ per_field_sets.append(cand)
498
+
499
+ # Intersect smallest-first for speed
500
+ pre_location_keys = intersect_smallest_first(per_field_sets)
501
+ if not pre_location_keys:
502
+ return MatchResult(matches=[], location_only_rejects={})
424
503
  else:
425
- # no required fields at all
504
+ # no required fields at all → start from all keys
426
505
  pre_location_keys = set(lookup_mapping.keys())
427
506
 
428
- # apply location last
507
+ # Apply location as an extra filter (kept separate to preserve location-only diagnostics)
429
508
  candidate_keys = pre_location_keys
430
509
  if has_location_constraint:
431
510
  loc_cand = field_candidates("location", flow_to_match.get("location"), op)
511
+ # Intersect with location last (fast set op on already reduced key-space)
432
512
  candidate_keys = pre_location_keys & loc_cand
433
513
 
434
514
  # noloc matches (for diagnosing location-only)
@@ -440,7 +520,7 @@ def match_with_index(
440
520
  full_matches = gather_positions(candidate_keys, flow_to_match)
441
521
 
442
522
  loc_only = (
443
- set(noloc_matches) - set(full_matches) if has_location_constraint else set()
523
+ (set(noloc_matches) - set(full_matches)) if has_location_constraint else set()
444
524
  )
445
525
 
446
526
  return MatchResult(
@@ -450,8 +530,17 @@ def match_with_index(
450
530
 
451
531
 
452
532
  def compute_cf_memoized_factory(
453
- cf_index, required_supplier_fields, required_consumer_fields, weights
533
+ cf_index, required_supplier_fields, required_consumer_fields
454
534
  ):
535
+ """
536
+ Factory for a memoized compute_average_cf over signature/location candidates.
537
+
538
+ :param cf_index: CF index keyed by (supplier_loc, consumer_loc).
539
+ :param required_supplier_fields: Required fields for supplier signature.
540
+ :param required_consumer_fields: Required fields for consumer signature.
541
+ :return: Cached function(s_key, c_key, supplier_candidates, consumer_candidates) -> tuple.
542
+ """
543
+
455
544
  @lru_cache(maxsize=None)
456
545
  def compute_cf(s_key, c_key, supplier_candidates, consumer_candidates):
457
546
  return compute_average_cf(
@@ -468,6 +557,14 @@ def compute_cf_memoized_factory(
468
557
 
469
558
 
470
559
  def normalize_signature_data(info_dict, required_fields):
560
+ """
561
+ Filter and normalize a dict to required fields for signature hashing.
562
+
563
+ :param info_dict: Original supplier/consumer info dict.
564
+ :param required_fields: Required field names to keep.
565
+ :return: Filtered dict with normalized 'classifications' if present.
566
+ """
567
+
471
568
  filtered = {k: info_dict[k] for k in required_fields if k in info_dict}
472
569
 
473
570
  # Normalize classifications
@@ -501,49 +598,76 @@ def normalize_signature_data(info_dict, required_fields):
501
598
  return filtered
502
599
 
503
600
 
504
- @lru_cache(maxsize=None)
601
+ @lru_cache(maxsize=4096)
602
+ def _available_locs_from_weights(weights_key_tuple: tuple, supplier: bool) -> tuple:
603
+ """
604
+ Project available locations from a stable weights key.
605
+ weights_key_tuple is a tuple of (supplier_loc, consumer_loc) pairs.
606
+ Returns a sorted, de-duplicated tuple of allowed codes for the given side.
607
+ """
608
+ if supplier:
609
+ vals = {w[0] for w in weights_key_tuple}
610
+ else:
611
+ vals = {w[1] for w in weights_key_tuple}
612
+ # Keep deterministic order; don't special-case __ANY__ here
613
+ return tuple(sorted(vals))
614
+
615
+
616
+ @lru_cache(maxsize=200_000)
505
617
  def resolve_candidate_locations(
506
618
  *,
507
619
  geo,
508
620
  location: str,
509
- weights: frozenset,
621
+ weights: tuple,
510
622
  containing: bool = False,
511
- exceptions: set = None,
623
+ exceptions: tuple | None = None,  # <- changed: tuple for caching
512
624
  supplier: bool = True,
513
- ) -> list:
625
+ ) -> tuple:
514
626
  """
515
- Resolve candidate consumer locations from a base location.
516
-
517
- Parameters:
518
- - geo: GeoResolver instance
519
- - location: base location string (e.g., "GLO", "CH")
520
- - weights: valid weight region codes
521
- - containing: if True, return regions containing the location;
522
- if False, return regions contained by the location
523
- - exceptions: list of regions to exclude (used with GLO fallback)
524
-
525
- Returns:
526
- - list of valid candidate location codes
627
+ Cached candidate resolver:
628
+ - derives available locations once per weights_key_tuple + side
629
+ - filters inside (including dropping 'GLO' when expanding GLO) to avoid extra list comps in hot loops
630
+ - returns a tuple (hashable, deterministic)
527
631
  """
528
632
  try:
633
+ exceptions = list(exceptions) if exceptions else []
529
634
  candidates = geo.resolve(
530
- location=location,
531
- containing=containing,
532
- exceptions=exceptions or [],
635
+ location=location, containing=containing, exceptions=exceptions
533
636
  )
534
637
  except KeyError:
535
- return []
638
+ return tuple()
639
+
640
+ # When expanding GLO to its contained regions, drop 'GLO' itself here
641
+ if containing and isinstance(location, str) and location == "GLO":
642
+ candidates = [c for c in candidates if c != "GLO"]
536
643
 
537
- if supplier is True:
538
- available_locs = [loc[0] for loc in weights]
644
+ avail = _available_locs_from_weights(weights, supplier=supplier)
645
+
646
+ # If wildcard is allowed on this side, we don't filter candidates by availability
647
+ if "__ANY__" in avail:
648
+ pool = candidates
539
649
  else:
540
- available_locs = [loc[1] for loc in weights]
541
- return [loc for loc in candidates if loc in available_locs]
650
+ # avail is small; convert to set once for O(1) membership
651
+ a = set(avail)
652
+ pool = [loc for loc in candidates if loc in a]
653
+
654
+ # Deterministic ordering across platforms
655
+ # If you still want 'GLO' first (we dropped it above for GLO-expansion),
656
+ # keep the same policy for non-GLO locations
657
+ return tuple(sorted(set(pool)))
542
658
 
543
659
 
544
660
  def group_edges_by_signature(
545
661
  edge_list, required_supplier_fields, required_consumer_fields
546
662
  ):
663
+ """
664
+ Group edges by (supplier signature, consumer signature, candidate locations).
665
+
666
+ :param edge_list: Iterable of (s_idx, c_idx, s_info, c_info, s_cands, c_cands).
667
+ :param required_supplier_fields: Supplier fields required for signature.
668
+ :param required_consumer_fields: Consumer fields required for signature.
669
+ :return: Dict[(s_key, c_key, (s_cands, c_cands))] -> list of (s_idx, c_idx).
670
+ """
547
671
  grouped = defaultdict(list)
548
672
 
549
673
  for (
@@ -567,12 +691,15 @@ def group_edges_by_signature(
567
691
 
568
692
  grouped[(s_key, c_key, loc_key)].append((supplier_idx, consumer_idx))
569
693
 
694
+ for _k in grouped:
695
+ grouped[_k].sort()
696
+
570
697
  return grouped
571
698
 
572
699
 
573
700
  def compute_average_cf(
574
- candidate_suppliers: list,
575
- candidate_consumers: list,
701
+ candidate_suppliers: list | tuple,
702
+ candidate_consumers: list | tuple,
576
703
  supplier_info: dict,
577
704
  consumer_info: dict,
578
705
  cf_index: dict,
@@ -580,12 +707,62 @@ def compute_average_cf(
580
707
  required_consumer_fields: set = None,
581
708
  ) -> tuple[str | float, Optional[dict], Optional[dict]]:
582
709
  """
583
- Compute weighted CF and a canonical aggregated uncertainty for composite regions.
584
- Returns: (expr_or_value, matched_cf_obj|None, agg_uncertainty|None)
710
+ Compute a weighted CF expression and aggregated uncertainty for composite regions.
711
+ Deterministic across platforms without deep freezing: we sort by (s_loc, c_loc, cf_signature),
712
+ where cf_signature is a compact, shallow tuple of stable fields.
585
713
  """
586
- # Optional timing (only if DEBUG)
587
714
  _t0 = time.perf_counter() if logger.isEnabledFor(logging.DEBUG) else None
588
715
 
716
+ # ---- compact, shallow signatures (no deep recursion) ----
717
+ # Keep only a few stable fields that define semantics; fall back to repr for odd types.
718
+ def _cf_signature(cf: dict) -> tuple:
719
+ # Pull once to locals (avoid many dict.get calls)
720
+ # Choose a small set of fields that make equal CFs sort adjacent/stably
721
+ v = cf.get("value")
722
+ w = cf.get("weight")
723
+ u = cf.get("unit")
724
+ sym = cf.get("symbolic") # expression or None
725
+ # If there is an explicit identifier, prefer it for stability
726
+ cfid = cf.get("id") or cf.get("code") or None
727
+ # Normalize numerics; avoid touching nested dicts/lists
728
+ try:
729
+ v_norm = float(v) if isinstance(v, (int, float)) else repr(v)
730
+ except Exception:
731
+ v_norm = repr(v)
732
+ try:
733
+ w_norm = (
734
+ float(w)
735
+ if isinstance(w, (int, float))
736
+ else (0.0 if w in (None, "", False) else 0.0)
737
+ )
738
+ except Exception:
739
+ w_norm = 0.0
740
+ return (cfid, v_norm, u or "", bool(sym))
741
+
742
+ def _unc_signature(unc: dict | None) -> tuple:
743
+ if not unc:
744
+ return ("",)
745
+ dist = unc.get("distribution", "")
746
+ neg = unc.get("negative", None)
747
+ # Shallow, order-stable snapshot of top-level parameters only
748
+ params = unc.get("parameters")
749
+ if isinstance(params, dict):
750
+ # Only sort top-level keys; values kept as-is (repr) to avoid deep cost
751
+ par_sig = tuple(sorted((k, repr(params[k])) for k in params.keys()))
752
+ else:
753
+ par_sig = repr(params)
754
+ return (
755
+ dist,
756
+ 1 if neg in (1, True) else 0 if neg in (0, False) else -1,
757
+ par_sig,
758
+ )
759
+
760
+ # ---------- 1) Canonicalize candidate pools (once) ----------
761
+ if not isinstance(candidate_suppliers, tuple):
762
+ candidate_suppliers = tuple(set(candidate_suppliers))
763
+ if not isinstance(candidate_consumers, tuple):
764
+ candidate_consumers = tuple(set(candidate_consumers))
765
+
589
766
  if not candidate_suppliers and not candidate_consumers:
590
767
  logger.warning(
591
768
  "CF-AVG: no candidate locations provided | supplier_cands=%s | consumer_cands=%s",
@@ -594,59 +771,60 @@ def compute_average_cf(
594
771
  )
595
772
  return 0, None, None
596
773
 
597
- # -------- Gate 1: location-key presence in cf_index --------
598
- valid_location_pairs = [
599
- (s, c)
600
- for s in candidate_suppliers
601
- for c in candidate_consumers
602
- if cf_index.get((s, c))
603
- ]
774
+ S = candidate_suppliers
775
+ C = candidate_consumers
776
+ setS, setC = set(S), set(C)
777
+
778
+ # ---------- 2) Efficient valid (s,c) pair discovery ----------
779
+ idx_keys = cf_index.keys()
780
+ prod_size = len(S) * len(C)
781
+ if prod_size and prod_size <= len(idx_keys):
782
+ valid_location_pairs = [(s, c) for s in S for c in C if (s, c) in cf_index]
783
+ # S and C are already sorted; this is lexicographically ordered
784
+ else:
785
+ valid_location_pairs = [k for k in idx_keys if k[0] in setS and k[1] in setC]
786
+ valid_location_pairs.sort()
604
787
 
605
788
  if not valid_location_pairs:
606
789
  if logger.isEnabledFor(logging.DEBUG):
607
- # show small sample of what keys do exist for quick diagnosis
608
- some_keys = _head(cf_index.keys(), 10)
790
+ some_keys = _head(idx_keys, 10)
609
791
  logger.debug(
610
792
  "CF-AVG: no (supplier,consumer) keys in cf_index for candidates "
611
793
  "| suppliers=%s | consumers=%s | sample_index_keys=%s",
612
- _head(candidate_suppliers),
613
- _head(candidate_consumers),
794
+ _head(S),
795
+ _head(C),
614
796
  some_keys,
615
797
  )
616
798
  return 0, None, None
617
- else:
618
- if logger.isEnabledFor(logging.DEBUG):
619
- logger.debug(
620
- "CF-AVG: %d valid (s,c) keys found (showing up to 10): %s",
621
- len(valid_location_pairs),
622
- _head(valid_location_pairs, 10),
623
- )
624
799
 
625
- # Build field-filtered views (exclude location; added per-loop)
626
- filtered_supplier = {
800
+ # ---------- 3) Base, field-filtered views (exclude 'location' here) ----------
801
+ required_supplier_fields = required_supplier_fields or set()
802
+ required_consumer_fields = required_consumer_fields or set()
803
+
804
+ base_supplier = {
627
805
  k: supplier_info[k]
628
- for k in (required_supplier_fields or ())
806
+ for k in required_supplier_fields
629
807
  if k in supplier_info and k != "location"
630
808
  }
631
- filtered_consumer = {
809
+ base_consumer = {
632
810
  k: consumer_info[k]
633
- for k in (required_consumer_fields or ())
811
+ for k in required_consumer_fields
634
812
  if k in consumer_info and k != "location"
635
813
  }
636
814
 
637
- # -------- Gate 2: field/operator/classification match --------
638
- matched = []
815
+ # ---------- 4) Field/operator/classification match ----------
816
+ matched: list[tuple[str, str, dict]] = []
639
817
  total_candidates_seen = 0
640
818
 
641
819
  for s_loc, c_loc in valid_location_pairs:
642
820
  cands = cf_index.get((s_loc, c_loc)) or []
643
821
  total_candidates_seen += len(cands)
644
822
 
645
- filtered_supplier["location"] = s_loc
646
- filtered_consumer["location"] = c_loc
823
+ fs = {**base_supplier, "location": s_loc}
824
+ fc = {**base_consumer, "location": c_loc}
647
825
 
648
- got = process_cf_list(cands, filtered_supplier, filtered_consumer)
649
- if logger.isEnabledFor(logging.DEBUG) and got:
826
+ got = process_cf_list(cands, fs, fc)
827
+ if got and logger.isEnabledFor(logging.DEBUG):
650
828
  logger.debug(
651
829
  "CF-AVG: matched %d/%d CFs @ (%s,%s); example=%s",
652
830
  len(got),
@@ -655,7 +833,8 @@ def compute_average_cf(
655
833
  c_loc,
656
834
  _short_cf(got[0]),
657
835
  )
658
- matched.extend(got)
836
+ for cf in got:
837
+ matched.append((s_loc, c_loc, cf))
659
838
 
660
839
  if not matched:
661
840
  if logger.isEnabledFor(logging.DEBUG):
@@ -669,37 +848,54 @@ def compute_average_cf(
669
848
  )
670
849
  return 0, None, None
671
850
 
672
- # Weights
673
- total_w = sum(cf.get("weight", 0.0) for cf in matched)
674
- if total_w == 0:
675
- logger.warning(
676
- "CF-AVG: weights all zero/missing using equal shares | matched=%d | example=%s",
677
- len(matched),
678
- _short_cf(matched[0]) if matched else None,
679
- )
680
- matched_cfs = [(cf, 1.0 / len(matched)) for cf in matched]
851
+ # ---------- 5) Deterministic ordering without deep freezing ----------
852
+ matched.sort(key=lambda t: (t[0], t[1], _cf_signature(t[2])))
853
+
854
+ # ---------- 6) Build and normalize weights ----------
855
+ # Pull weights once; avoid repeated cf.get in loops
856
+ weights = []
857
+ for _s, _c, cf in matched:
858
+ w = cf.get("weight", 0.0)
859
+ try:
860
+ w = float(w)
861
+ except Exception:
862
+ w = 0.0
863
+ if not np.isfinite(w) or w < 0.0:
864
+ w = 0.0
865
+ weights.append(w)
866
+
867
+ w_arr = np.asarray(weights, dtype=np.float64)
868
+ w_sum = float(w_arr.sum(dtype=np.float64))
869
+ n_m = len(matched)
870
+
871
+ if w_sum <= 0.0:
872
+ shares = np.full(n_m, 1.0 / n_m, dtype=np.float64)
873
+ if logger.isEnabledFor(logging.DEBUG):
874
+ logger.debug(
875
+ "CF-AVG: weights all zero/missing → using equal shares | matched=%d | example=%s",
876
+ n_m,
877
+ _short_cf(matched[0][2]) if matched else None,
878
+ )
681
879
  else:
682
- matched_cfs = [(cf, cf.get("weight", 0.0) / total_w) for cf in matched]
683
-
684
- # Safety check on weights; log before assert explodes
685
- share_sum = sum(s for _, s in matched_cfs)
686
- if logger.isEnabledFor(logging.DEBUG):
687
- logger.debug(
688
- "CF-AVG: matched=%d | sum_shares=%.6f | example=%s",
689
- len(matched_cfs),
690
- share_sum,
691
- _short_cf(matched_cfs[0][0]) if matched_cfs else None,
880
+ shares = w_arr / w_sum
881
+ # prune tiny contributions to stabilize representation
882
+ shares = np.where(shares < 1e-4, 0.0, shares)
883
+ ssum = float(shares.sum(dtype=np.float64))
884
+ shares = (
885
+ (shares / ssum) if ssum > 0.0 else np.full(n_m, 1.0 / n_m, dtype=np.float64)
692
886
  )
693
887
 
694
- assert np.isclose(share_sum, 1.0), f"Total shares must equal 1. Got: {share_sum}"
695
-
696
- # Build deterministic expression (string)
697
- expressions = [f"({share:.3f} * ({cf['value']}))" for cf, share in matched_cfs]
888
+ # ---------- 7) Expression assembly (uses matched order) ----------
889
+ # Use shallow value access (no deep repr/formatting)
890
+ expressions = []
891
+ for (_s, _c, cf), sh in zip(matched, shares):
892
+ if sh > 0.0:
893
+ expressions.append(f"({sh:.4f} * ({cf.get('value')}))")
698
894
  expr = " + ".join(expressions)
699
895
 
700
- # Single CF shortcut (pass-through uncertainty)
701
- if len(matched_cfs) == 1:
702
- single_cf = matched_cfs[0][0]
896
+ # ---------- 8) Single CF shortcut ----------
897
+ if len(matched) == 1:
898
+ single_cf = matched[0][2]
703
899
  agg_uncertainty = single_cf.get("uncertainty")
704
900
  if logger.isEnabledFor(logging.DEBUG):
705
901
  dt = (time.perf_counter() - _t0) if _t0 else None
@@ -711,9 +907,10 @@ def compute_average_cf(
711
907
  )
712
908
  return (expr, single_cf, agg_uncertainty)
713
909
 
714
- # Multi-CF aggregated uncertainty
910
+ # ---------- 9) Aggregate uncertainty (deterministic, shallow) ----------
715
911
  def _cf_sign(cf_obj) -> int | None:
716
- neg = (cf_obj.get("uncertainty") or {}).get("negative", None)
912
+ unc = cf_obj.get("uncertainty")
913
+ neg = None if unc is None else unc.get("negative", None)
717
914
  if neg in (0, 1):
718
915
  return -1 if neg == 1 else +1
719
916
  v = cf_obj.get("value")
@@ -721,54 +918,70 @@ def compute_average_cf(
721
918
  return -1 if v < 0 else (+1 if v > 0 else None)
722
919
  return None
723
920
 
724
- cf_signs = [s for (cf, _sh) in matched_cfs if (s := _cf_sign(cf)) is not None]
921
+ cf_signs = [_cf_sign(cf) for (_s, _c, cf) in matched]
922
+ cf_signs = [s for s in cf_signs if s is not None]
725
923
  agg_sign = (
726
924
  cf_signs[0] if (cf_signs and all(s == cf_signs[0] for s in cf_signs)) else None
727
925
  )
728
926
 
729
927
  child_values, child_weights = [], []
730
- for cf, share in matched_cfs:
731
- if share <= 0:
928
+ for (_s, _c, cf), sh in zip(matched, shares):
929
+ if sh <= 0.0:
732
930
  continue
733
- if cf.get("uncertainty") is not None:
734
- u = deepcopy(cf["uncertainty"])
735
- u["negative"] = 0
736
- child_unc = u
931
+ unc = cf.get("uncertainty")
932
+ if unc is not None:
933
+ # Shallow copy of top-level only (keeps nested as-is)
934
+ child_unc = {
935
+ k: (dict(v) if isinstance(v, dict) else v) for k, v in unc.items()
936
+ }
937
+ child_unc["negative"] = 0
737
938
  else:
738
939
  v = cf.get("value")
739
940
  if isinstance(v, (int, float)):
740
941
  child_unc = {
741
942
  "distribution": "discrete_empirical",
742
- "parameters": {"values": [abs(v)], "weights": [1.0]},
943
+ "parameters": {"values": [abs(float(v))], "weights": [1.0]},
743
944
  "negative": 0,
744
945
  }
745
946
  else:
947
+ # symbolic without uncertainty: cannot aggregate deterministically
746
948
  if logger.isEnabledFor(logging.DEBUG):
747
949
  logger.debug(
748
950
  "CF-AVG: skip agg-unc (symbolic child without unc) | child=%s",
749
951
  _short_cf(cf),
750
952
  )
751
- return (expr, None, None)
953
+ return expr, None, None
752
954
  child_values.append(child_unc)
753
- child_weights.append(float(share))
955
+ child_weights.append(float(sh))
754
956
 
755
- wsum = sum(child_weights) or 1.0
756
- child_weights = [w / wsum for w in child_weights]
957
+ if not child_values:
958
+ if logger.isEnabledFor(logging.DEBUG):
959
+ logger.debug("CF-AVG: filtered children empty after cleanup.")
960
+ return 0, None, None
961
+
962
+ w = np.asarray(child_weights, dtype=np.float64)
963
+ w = np.clip(w, 0.0, None)
964
+ wsum = float(w.sum(dtype=np.float64))
965
+ w = (w / wsum) if wsum > 0.0 else np.full_like(w, 1.0 / len(w), dtype=np.float64)
757
966
 
758
- ordering = sorted(
759
- range(len(child_values)),
760
- key=lambda i: json.dumps(child_values[i], sort_keys=True),
967
+ # Deterministic order of child uncertainties via shallow signature only
968
+ order = sorted(
969
+ range(len(child_values)), key=lambda i: _unc_signature(child_values[i])
761
970
  )
762
- child_values = [child_values[i] for i in ordering]
763
- child_weights = [child_weights[i] for i in ordering]
971
+ child_values = [child_values[i] for i in order]
972
+ child_weights = [float(w[i]) for i in order]
764
973
 
974
+ # Final cleanup
765
975
  filtered = [
766
- (v, w) for v, w in zip(child_values, child_weights) if w > 0 and v is not None
976
+ (v, wt)
977
+ for v, wt in zip(child_values, child_weights)
978
+ if wt > 0.0 and v is not None
767
979
  ]
768
980
  if not filtered:
769
981
  if logger.isEnabledFor(logging.DEBUG):
770
- logger.debug("CF-AVG: filtered children empty after cleanup.")
982
+ logger.debug("CF-AVG: filtered children empty after cleanup (post-sort).")
771
983
  return 0, None, None
984
+
772
985
  child_values, child_weights = zip(*filtered)
773
986
 
774
987
  agg_uncertainty = {
@@ -781,11 +994,12 @@ def compute_average_cf(
781
994
  if logger.isEnabledFor(logging.DEBUG):
782
995
  dt = (time.perf_counter() - _t0) if _t0 else None
783
996
  logger.debug(
784
- "CF-AVG: success | children=%d | expr_len=%d | agg_sign=%s | dt=%.3f ms",
997
+ "CF-AVG: success | children=%d | expr_len=%d | agg_sign=%s | dt=%.3f ms | expr=%s",
785
998
  len(child_values),
786
999
  len(expr),
787
1000
  agg_sign,
788
1001
  (dt * 1000.0) if dt else -1.0,
1002
+ expr,
789
1003
  )
790
1004
 
791
- return (expr, None, agg_uncertainty)
1005
+ return expr, None, agg_uncertainty