agift-graph 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agift/__init__.py +47 -0
- agift/__main__.py +5 -0
- agift/cli.py +200 -0
- agift/common.py +210 -0
- agift/embed.py +203 -0
- agift/fetch.py +122 -0
- agift/graph.py +105 -0
- agift/link.py +130 -0
- agift_graph-0.1.0.dist-info/METADATA +154 -0
- agift_graph-0.1.0.dist-info/RECORD +14 -0
- agift_graph-0.1.0.dist-info/WHEEL +5 -0
- agift_graph-0.1.0.dist-info/entry_points.txt +2 -0
- agift_graph-0.1.0.dist-info/licenses/LICENSE +15 -0
- agift_graph-0.1.0.dist-info/top_level.txt +1 -0
agift/__init__.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""AGIFT Graph Builder — modular ETL pipeline for Neo4j."""
|
|
2
|
+
|
|
3
|
+
from agift.common import (
|
|
4
|
+
AGIFT_TOP_TO_DCAT,
|
|
5
|
+
LOCAL_MODELS,
|
|
6
|
+
PROVIDER_ISAACUS,
|
|
7
|
+
PROVIDER_LOCAL,
|
|
8
|
+
SEMANTIC_EDGE_WEIGHT,
|
|
9
|
+
SIMILARITY_THRESHOLD,
|
|
10
|
+
STRUCTURAL_EDGE_WEIGHT,
|
|
11
|
+
TEMATRES_BASE,
|
|
12
|
+
VALID_DIMENSIONS,
|
|
13
|
+
VALID_PROVIDERS,
|
|
14
|
+
get_config_from_neo4j,
|
|
15
|
+
get_neo4j_driver,
|
|
16
|
+
log_run,
|
|
17
|
+
print_summary,
|
|
18
|
+
)
|
|
19
|
+
from agift.fetch import AgiftTerm, fetch_full_hierarchy
|
|
20
|
+
from agift.graph import ensure_schema, upsert_graph
|
|
21
|
+
from agift.embed import build_hierarchical_text, embed_terms, embed_terms_local
|
|
22
|
+
from agift.link import build_semantic_edges
|
|
23
|
+
|
|
24
|
+
# Public API of the package: the constants, helpers and pipeline stage
# functions imported above from agift.common, agift.fetch, agift.graph,
# agift.embed and agift.link.
__all__ = [
    "AGIFT_TOP_TO_DCAT",
    "AgiftTerm",
    "LOCAL_MODELS",
    "PROVIDER_ISAACUS",
    "PROVIDER_LOCAL",
    "SEMANTIC_EDGE_WEIGHT",
    "SIMILARITY_THRESHOLD",
    "STRUCTURAL_EDGE_WEIGHT",
    "TEMATRES_BASE",
    "VALID_DIMENSIONS",
    "VALID_PROVIDERS",
    "build_hierarchical_text",
    "build_semantic_edges",
    "embed_terms",
    "embed_terms_local",
    "ensure_schema",
    "fetch_full_hierarchy",
    "get_config_from_neo4j",
    "get_neo4j_driver",
    "log_run",
    "print_summary",
    "upsert_graph",
]
|
agift/__main__.py
ADDED
agift/cli.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI entry point for AGIFT import pipeline.
|
|
3
|
+
|
|
4
|
+
Usage (via pip install):
|
|
5
|
+
agift
|
|
6
|
+
agift --dry-run
|
|
7
|
+
agift --skip-embed
|
|
8
|
+
agift --force-embed
|
|
9
|
+
agift --skip-semantic
|
|
10
|
+
|
|
11
|
+
Usage (direct):
|
|
12
|
+
python -m agift
|
|
13
|
+
python import_agift.py
|
|
14
|
+
|
|
15
|
+
Source: https://vocabularyserver.com/agift/services.php
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import os
|
|
20
|
+
import sys
|
|
21
|
+
import time
|
|
22
|
+
from datetime import datetime, timezone
|
|
23
|
+
|
|
24
|
+
from agift.common import (
|
|
25
|
+
PROVIDER_ISAACUS,
|
|
26
|
+
PROVIDER_LOCAL,
|
|
27
|
+
TEMATRES_BASE,
|
|
28
|
+
VALID_DIMENSIONS,
|
|
29
|
+
VALID_PROVIDERS,
|
|
30
|
+
get_config_from_neo4j,
|
|
31
|
+
get_neo4j_driver,
|
|
32
|
+
log_run,
|
|
33
|
+
print_summary,
|
|
34
|
+
)
|
|
35
|
+
from agift.fetch import fetch_full_hierarchy
|
|
36
|
+
from agift.graph import ensure_schema, upsert_graph
|
|
37
|
+
from agift.embed import embed_terms, embed_terms_local
|
|
38
|
+
from agift.link import build_semantic_edges
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def main():
    """Run the AGIFT import pipeline from the command line.

    Stages:
        1. Fetch the term hierarchy from TemaTres.
        2. Upsert terms and structural edges into Neo4j.
        3. Embed terms (Isaacus or local provider), unless ``--skip-embed``.
        4. Build semantic edges, unless ``--skip-semantic``.

    With ``--dry-run`` only stage 1 runs and a preview of the fetched terms
    is printed. The process exits with status 1 when Neo4j cannot be reached
    or when any stage after the connection raises; in the latter case an
    error run log is written on a best-effort basis.
    """
    parser = argparse.ArgumentParser(
        description="Import AGIFT vocabulary into Neo4j graph with embeddings"
    )
    parser.add_argument("--dry-run", action="store_true",
                        help="Fetch from API but don't write to Neo4j")
    parser.add_argument("--skip-alt", action="store_true",
                        help="Skip fetching alt labels (faster)")
    parser.add_argument("--skip-embed", action="store_true",
                        help="Skip embedding generation")
    parser.add_argument("--force-embed", action="store_true",
                        help="Re-embed all terms, not just new/changed")
    parser.add_argument("--skip-semantic", action="store_true",
                        help="Skip semantic edge generation")
    parser.add_argument("--provider", choices=list(VALID_PROVIDERS),
                        help="Override embedding provider (isaacus or local)")
    parser.add_argument("--dimension", type=int, choices=list(VALID_DIMENSIONS),
                        help="Override embedding dimension")
    parser.add_argument("--threshold", type=float, default=None,
                        help="Cosine similarity threshold for semantic edges")
    args = parser.parse_args()

    started_at = datetime.now(timezone.utc).isoformat()
    run_details = {"started_at": started_at}

    print("=" * 60)
    print("AGIFT Vocabulary Import (Neo4j + Embeddings + Semantic Edges)")
    print("=" * 60)
    print(f"Source: {TEMATRES_BASE}")
    print()

    # Stage 1: Fetch from TemaTres
    start = time.time()
    terms = fetch_full_hierarchy(include_alts=not args.skip_alt)
    elapsed = time.time() - start
    print(f"\nFetched {len(terms)} terms in {elapsed:.1f}s")
    run_details["fetched"] = len(terms)

    total_alts = sum(len(t.alt_labels) for t in terms)
    print(f"Total alt labels: {total_alts}")

    if args.dry_run:
        print("\n[DRY RUN] Skipping Neo4j write and embedding.")
        preview = terms[:10]
        for t in preview:
            alts = f" (alts: {', '.join(t.alt_labels)})" if t.alt_labels else ""
            print(f" L{t.depth} [{t.dcat_theme}] {t.label}{alts}")
        # Only mention a remainder when there is one; with fewer than ten
        # terms the old subtraction printed a negative count.
        remaining = len(terms) - len(preview)
        if remaining > 0:
            print(f" ... and {remaining} more")
        return

    # Connect to Neo4j
    print("\nConnecting to Neo4j...")
    try:
        driver = get_neo4j_driver()
        driver.verify_connectivity()
    except Exception as e:
        print(f"ERROR: Cannot connect to Neo4j: {e}")
        sys.exit(1)

    try:
        # Stage 2: Build graph (structural edges)
        print("\nStage 2: Building graph (structural edges)...")
        ensure_schema(driver)
        graph_stats = upsert_graph(driver, terms)
        print(f" Created: {graph_stats['created']}")
        print(f" Updated: {graph_stats['updated']}")
        print(f" Unchanged: {graph_stats['unchanged']}")
        run_details.update({
            "created": graph_stats["created"],
            "updated": graph_stats["updated"],
            "unchanged": graph_stats["unchanged"],
        })

        # Stage 3: Embed
        if args.skip_embed:
            print("\nStage 3: Skipped (--skip-embed)")
            run_details["embedded"] = 0
            run_details["embed_failed"] = 0
            run_details["embedding_provider"] = ""
        else:
            config = get_config_from_neo4j(driver)
            # CLI overrides win over the stored dashboard config.
            provider = args.provider or config["embedding_provider"]
            dimension = args.dimension or config["embedding_dimension"]
            run_details["embedding_provider"] = provider

            # Determine which term IDs to embed
            if args.force_embed:
                with driver.session() as session:
                    result = session.run(
                        "MATCH (t:Term) RETURN t.term_id AS tid"
                    )
                    embed_ids = [r["tid"] for r in result]
            else:
                embed_ids = graph_stats["changed_ids"]

            if not embed_ids:
                print("\nStage 3: No terms need embedding")
                run_details["embedded"] = 0
                run_details["embed_failed"] = 0
            elif provider == PROVIDER_LOCAL:
                print(f"\nStage 3: Local embedding {len(embed_ids)} terms "
                      f"(dimension={dimension})...")
                embed_stats = embed_terms_local(driver, embed_ids, dimension)
                print(f" Embedded: {embed_stats['embedded']}")
                print(f" Failed: {embed_stats['failed']}")
                run_details["embedded"] = embed_stats["embedded"]
                run_details["embed_failed"] = embed_stats["failed"]
            else:
                # Isaacus provider
                api_key = config["isaacus_api_key"] or os.environ.get("ISAACUS_API_KEY")
                if not api_key:
                    print("\nStage 3: Skipped (no Isaacus API key configured)")
                    print(" Set via dashboard, or use --provider local")
                    run_details["embedded"] = 0
                    run_details["embed_failed"] = 0
                else:
                    print(f"\nStage 3: Isaacus embedding {len(embed_ids)} terms "
                          f"(dimension={dimension})...")
                    embed_stats = embed_terms(driver, embed_ids, api_key, dimension)
                    print(f" Embedded: {embed_stats['embedded']}")
                    print(f" Failed: {embed_stats['failed']}")
                    run_details["embedded"] = embed_stats["embedded"]
                    run_details["embed_failed"] = embed_stats["failed"]

        # Stage 4: Semantic edges
        if args.skip_semantic:
            print("\nStage 4: Skipped (--skip-semantic)")
            run_details["semantic_edges_created"] = 0
        else:
            config = get_config_from_neo4j(driver)
            # Compare against None explicitly: "--threshold 0" is a valid
            # request and must not fall through to the configured value.
            if args.threshold is not None:
                threshold = args.threshold
            else:
                threshold = config["similarity_threshold"]
            sem_weight = config["semantic_edge_weight"]

            print(f"\nStage 4: Building semantic edges "
                  f"(threshold={threshold}, weight={sem_weight})...")
            sem_stats = build_semantic_edges(driver, threshold, sem_weight)
            print(f" Created: {sem_stats['created']}")
            print(f" Skipped (structural): {sem_stats['skipped_structural']}")
            print(f" Below threshold: {sem_stats['below_threshold']}")
            run_details["semantic_edges_created"] = sem_stats["created"]

        print_summary(driver)
        log_run(driver, "success", run_details)

    except Exception as e:
        print(f"\nERROR: {e}")
        run_details["error"] = str(e)
        try:
            log_run(driver, "error", run_details)
        except Exception as log_err:
            # Best-effort: the database may be the thing that failed, so a
            # logging failure must not mask the original error.
            print(f"WARNING: could not write run log: {log_err}")
        sys.exit(1)
    finally:
        driver.close()

    print("\nDone.")


if __name__ == "__main__":
    main()
|
agift/common.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""Shared constants, Neo4j helpers, and run logging."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
|
|
6
|
+
from neo4j import GraphDatabase
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# ---------------------------------------------------------------------------
|
|
10
|
+
# AGIFT top-level function → DCAT-AP theme mapping
|
|
11
|
+
# ---------------------------------------------------------------------------
|
|
12
|
+
# Keys are lower-cased AGIFT top-level function labels; values are the
# DCAT-AP theme codes assigned to every term under that function.
AGIFT_TOP_TO_DCAT: dict[str, str] = {
    "business support and regulation": "ENTR",
    "civic infraestructure": "REGI",  # sic — typo in AGIFT source
    "communications": "INTE",
    "community services": "SOCI",
    "cultural affairs": "CULT",
    "defence": "JUST",
    "education and training": "EDUC",
    "employment": "ECON",
    "environment": "ENVI",
    "finance management": "ECON",
    "governance": "GOVE",
    "health care": "HEAL",
    "immigration": "MIGR",
    "indigenous affairs": "SOCI",
    "international relations": "INTR",
    "justice administration": "JUST",
    "maritime services": "TRAN",
    "natural resources": "ENVI",
    "primary industries": "AGRI",
    "science": "TECH",
    "security": "JUST",
    "sport and recreation": "CULT",
    "statistical services": "GOVE",
    "tourism": "CULT",
    "trade": "ECON",
    "transport": "TRAN",
}

# Base URL of the TemaTres web service that serves the AGIFT vocabulary.
TEMATRES_BASE = "https://vocabularyserver.com/agift/services.php"
# Embedding dimensions accepted by the CLI's --dimension option.
VALID_DIMENSIONS = (256, 384, 512, 768, 1024, 1792)

# Embedding provider constants
PROVIDER_ISAACUS = "isaacus"
PROVIDER_LOCAL = "local"
VALID_PROVIDERS = (PROVIDER_ISAACUS, PROVIDER_LOCAL)

# Local sentence-transformers model map: dimension → model name
# NOTE: only these two dimensions have a local model; other entries of
# VALID_DIMENSIONS are rejected by the local embedding path.
LOCAL_MODELS: dict[int, str] = {
    384: "all-MiniLM-L6-v2",
    768: "all-mpnet-base-v2",
}

# Semantic edge defaults
SIMILARITY_THRESHOLD = 0.70
SEMANTIC_EDGE_WEIGHT = 0.5
STRUCTURAL_EDGE_WEIGHT = 1.0
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
# Neo4j connection helpers
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
def get_neo4j_driver():
    """Build a Neo4j driver configured from the process environment.

    Reads ``NEO4J_URI``, ``NEO4J_USER`` and ``NEO4J_PASSWORD``, falling back
    to ``bolt://localhost:7687``, ``neo4j`` and ``changeme`` respectively.

    Returns:
        The driver object produced by ``GraphDatabase.driver``.
    """
    env = os.environ
    credentials = (
        env.get("NEO4J_USER", "neo4j"),
        env.get("NEO4J_PASSWORD", "changeme"),
    )
    return GraphDatabase.driver(
        env.get("NEO4J_URI", "bolt://localhost:7687"),
        auth=credentials,
    )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def get_config_from_neo4j(driver) -> dict:
    """Read dashboard config (API key, dimension, provider) from Neo4j.

    Looks up the ``Config`` node named ``'agift'``. Missing nodes and missing
    properties fall back to the module defaults.

    Args:
        driver: Neo4j driver.

    Returns:
        Dict with isaacus_api_key, embedding_dimension, embedding_provider,
        similarity_threshold, and semantic_edge_weight.
    """
    # Single source of truth for the defaults, used both when the Config
    # node is absent and when an individual property is unset.
    config = {
        "isaacus_api_key": None,
        "embedding_dimension": 512,
        "embedding_provider": PROVIDER_ISAACUS,
        "similarity_threshold": SIMILARITY_THRESHOLD,
        "semantic_edge_weight": SEMANTIC_EDGE_WEIGHT,
    }

    with driver.session() as session:
        result = session.run(
            "MATCH (c:Config {name: 'agift'}) "
            "RETURN c.isaacus_api_key AS key, "
            " c.embedding_dimension AS dim, "
            " c.embedding_provider AS provider, "
            " c.similarity_threshold AS sim_thresh, "
            " c.semantic_edge_weight AS sem_weight"
        )
        record = result.single()
        if not record:
            return config

        config["isaacus_api_key"] = record["key"]
        # Empty/zero dimension or provider values are not meaningful, so
        # any falsy value falls back to the default.
        config["embedding_dimension"] = record["dim"] or config["embedding_dimension"]
        config["embedding_provider"] = record["provider"] or config["embedding_provider"]
        # 0.0 is a legitimate threshold/weight: only an unset (NULL)
        # property should be replaced by the default.
        for field, column in (
            ("similarity_threshold", "sim_thresh"),
            ("semantic_edge_weight", "sem_weight"),
        ):
            if record[column] is not None:
                config[field] = record[column]
        return config
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
# Run logging
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
def log_run(driver, status: str, details: dict) -> None:
    """Record one pipeline run as a ``RunLog`` node and prune old entries.

    Args:
        driver: Neo4j driver.
        status: "success" or "error".
        details: Run statistics; any missing key is stored as 0 (counts),
            an empty string (provider, error) or the current UTC time
            (started_at).
    """
    create_log = """
        CREATE (r:RunLog {
            worker: 'agift',
            status: $status,
            started_at: datetime($started),
            finished_at: datetime(),
            terms_fetched: $fetched,
            terms_created: $created,
            terms_updated: $updated,
            terms_unchanged: $unchanged,
            terms_embedded: $embedded,
            terms_embed_failed: $embed_failed,
            embedding_provider: $provider,
            semantic_edges_created: $sem_created,
            error_message: $error
        })
        """
    # Keep only the 20 most recently finished logs for this worker.
    prune_logs = """
        MATCH (r:RunLog {worker: 'agift'})
        WITH r ORDER BY r.finished_at DESC
        SKIP 20
        DELETE r
        """

    fallback_start = datetime.now(timezone.utc).isoformat()
    params = {
        "status": status,
        "started": details.get("started_at", fallback_start),
        "provider": details.get("embedding_provider", ""),
        "sem_created": details.get("semantic_edges_created", 0),
        "error": details.get("error", ""),
    }
    # These Cypher parameters share their name with the details key.
    for counter in ("fetched", "created", "updated", "unchanged",
                    "embedded", "embed_failed"):
        params[counter] = details.get(counter, 0)

    with driver.session() as session:
        session.run(create_log, **params)
        session.run(prune_logs)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def print_summary(driver) -> None:
    """Query Neo4j for graph statistics and print them to stdout.

    Reports the total and embedded term counts, the PARENT_OF and SIMILAR_TO
    edge counts, and per-depth and per-DCAT-theme term breakdowns.
    """
    with driver.session() as session:

        def scalar(query, column):
            # Single-row aggregate: return the one value of interest.
            return session.run(query).single()[column]

        def pairs(query, column):
            # Grouped aggregate: (group value, count) per row.
            return [(row[column], row["cnt"]) for row in session.run(query)]

        total = scalar("MATCH (t:Term) RETURN count(t) AS total", "total")
        embedded = scalar(
            "MATCH (t:Term) WHERE t.embedding IS NOT NULL RETURN count(t) AS embedded",
            "embedded",
        )
        by_depth = pairs(
            "MATCH (t:Term) RETURN t.depth AS depth, count(t) AS cnt ORDER BY depth",
            "depth",
        )
        by_theme = pairs(
            "MATCH (t:Term) RETURN t.dcat_theme AS theme, count(t) AS cnt "
            "ORDER BY cnt DESC",
            "theme",
        )
        structural_edges = scalar(
            "MATCH ()-[r:PARENT_OF]->() RETURN count(r) AS edges", "edges"
        )
        semantic_edges = scalar(
            "MATCH ()-[r:SIMILAR_TO]->() RETURN count(r) AS edges", "edges"
        )

    print("\n=== AGIFT Graph Summary ===")
    print(f"Total terms: {total}")
    print(f"Embedded: {embedded}")
    print(f"Structural edges: {structural_edges} (PARENT_OF, weight=1.0)")
    print(f"Semantic edges: {semantic_edges} (SIMILAR_TO, weight=0.5)")

    print("\nBy depth:")
    for depth, cnt in by_depth:
        print(f" L{depth}: {cnt}")

    print("\nBy DCAT-AP theme:")
    for theme, cnt in by_theme:
        print(f" {theme}: {cnt}")
|
agift/embed.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
"""Stage 3: Embed — generate vector embeddings for AGIFT terms."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
from agift.common import LOCAL_MODELS
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def build_hierarchical_text(driver, term_id: int) -> str:
    """Compose the text that gets embedded for one term.

    The labels on the path from a parentless root down to the term (at most
    two PARENT_OF hops) are joined with " > ", e.g.
    "Environment > Water resources management > Water quality monitoring".
    Any alt labels stored on the term are appended in an
    "(also known as: ...)" suffix.

    Args:
        driver: Neo4j driver.
        term_id: The term to build text for.

    Returns:
        The hierarchical context string. If no root path matches, the bare
        term label is returned, or "" when the term does not exist.
    """
    path_query = """
        MATCH path = (root:Term)-[:PARENT_OF*0..2]->(t:Term {term_id: $tid})
        WHERE NOT ()-[:PARENT_OF]->(root)
        RETURN [n IN nodes(path) | n.label] AS chain,
               t.alt_labels AS alts
        """

    with driver.session() as session:
        row = session.run(path_query, tid=term_id).single()

        if row:
            pieces = [" > ".join(row["chain"])]
            alt_labels = row["alts"]
            if alt_labels:
                pieces.append(f" (also known as: {', '.join(alt_labels)})")
            return "".join(pieces)

        # No root path matched: fall back to the term's own label.
        lone = session.run(
            "MATCH (t:Term {term_id: $tid}) RETURN t.label AS label, t.alt_labels AS alts",
            tid=term_id,
        ).single()
        if lone:
            return lone["label"]
        return ""
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def embed_terms(driver, term_ids: list[int], api_key: str, dimension: int) -> dict:
    """Generate and store Isaacus embeddings for AGIFT terms.

    Terms are processed in batches of 50: one embeddings request per batch,
    then one write per term. Terms for which no embedding text can be built
    are left out of the request and of both counters.

    Args:
        driver: Neo4j driver.
        term_ids: List of term_ids to embed.
        api_key: Isaacus API key.
        dimension: Embedding dimension.

    Returns:
        Dict with ``embedded`` and ``failed`` counts. Each submitted term is
        counted exactly once, even when a batch fails part-way through.

    Raises:
        ImportError: If the optional ``isaacus`` package is not installed.
    """
    try:
        from isaacus import Isaacus
    except ImportError as exc:
        raise ImportError(
            "The isaacus package is required for Isaacus embeddings.\n"
            "Install it with: pip install agift-graph[isaacus]"
        ) from exc

    client = Isaacus(api_key=api_key)
    stats = {"embedded": 0, "failed": 0}
    batch_size = 50

    for i in range(0, len(term_ids), batch_size):
        batch_ids = term_ids[i:i + batch_size]
        texts = []
        valid_ids = []

        for tid in batch_ids:
            text = build_hierarchical_text(driver, tid)
            if text:
                texts.append(text)
                valid_ids.append(tid)

        if not texts:
            continue

        # Number of terms of this batch actually written to Neo4j. Tracked
        # separately so a mid-batch failure does not count the terms that
        # were already stored as both embedded and failed.
        written = 0
        try:
            response = client.embeddings.create(
                model="kanon-2-embedder",
                texts=texts,
                dimensions=dimension,
            )

            with driver.session() as session:
                # Assumes the response lists embeddings in request order —
                # TODO confirm against the Isaacus API.
                for tid, embedding_data in zip(valid_ids, response.embeddings):
                    session.run(
                        """
                        MATCH (t:Term {term_id: $tid})
                        SET t.embedding = $embedding,
                            t.embedding_dimension = $dim,
                            t.embedding_provider = 'isaacus',
                            t.embedded_at = datetime()
                        """,
                        tid=tid,
                        embedding=embedding_data.embedding,
                        dim=dimension,
                    )
                    written += 1

            print(f" Embedded batch {i // batch_size + 1}: {written} terms")

        except Exception as e:
            # Best-effort per batch: report and continue with the next one.
            print(f" Embedding batch failed: {e}")

        stats["embedded"] += written
        stats["failed"] += len(texts) - written

    return stats
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def embed_terms_local(driver, term_ids: list[int], dimension: int) -> dict:
    """Generate and store embeddings using local sentence-transformers.

    The model is selected from ``LOCAL_MODELS`` by dimension:
        384 → all-MiniLM-L6-v2
        768 → all-mpnet-base-v2

    The model cache folder is taken from the ``TRANSFORMERS_CACHE``
    environment variable when set; otherwise the library default is used.
    Terms are processed in batches of 64. Terms for which no embedding text
    can be built are left out of both counters.

    Args:
        driver: Neo4j driver.
        term_ids: List of term_ids to embed.
        dimension: Embedding dimension (384 or 768).

    Returns:
        Dict with ``embedded`` and ``failed`` counts. For an unsupported
        dimension nothing is embedded and every term is reported as failed.
        Each submitted term is counted exactly once, even when a batch fails
        part-way through.

    Raises:
        ImportError: If the optional ``sentence-transformers`` package is
            not installed.
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:
        raise ImportError(
            "The sentence-transformers package is required for local embeddings.\n"
            "Install it with: pip install agift-graph[local]"
        ) from exc

    model_name = LOCAL_MODELS.get(dimension)
    if not model_name:
        print(f" ERROR: No local model for dimension {dimension}. "
              f"Use one of: {list(LOCAL_MODELS.keys())}")
        return {"embedded": 0, "failed": len(term_ids)}

    cache_dir = os.environ.get("TRANSFORMERS_CACHE", None)
    print(f" Loading model: {model_name} (dimension={dimension})...")
    model = SentenceTransformer(model_name, cache_folder=cache_dir)

    stats = {"embedded": 0, "failed": 0}
    batch_size = 64

    for i in range(0, len(term_ids), batch_size):
        batch_ids = term_ids[i:i + batch_size]
        texts = []
        valid_ids = []

        for tid in batch_ids:
            text = build_hierarchical_text(driver, tid)
            if text:
                texts.append(text)
                valid_ids.append(tid)

        if not texts:
            continue

        # Number of terms of this batch actually written to Neo4j. Tracked
        # separately so a mid-batch failure does not count the terms that
        # were already stored as both embedded and failed.
        written = 0
        try:
            embeddings = model.encode(texts, show_progress_bar=False)

            with driver.session() as session:
                for tid, vec in zip(valid_ids, embeddings):
                    session.run(
                        """
                        MATCH (t:Term {term_id: $tid})
                        SET t.embedding = $embedding,
                            t.embedding_dimension = $dim,
                            t.embedding_provider = 'local',
                            t.embedded_at = datetime()
                        """,
                        tid=tid,
                        embedding=vec.tolist(),
                        dim=dimension,
                    )
                    written += 1

            print(f" Embedded batch {i // batch_size + 1}: {written} terms")

        except Exception as e:
            # Best-effort per batch: report and continue with the next one.
            print(f" Local embedding batch failed: {e}")

        stats["embedded"] += written
        stats["failed"] += len(texts) - written

    return stats
|
agift/fetch.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""Stage 1: Fetch — pull full AGIFT hierarchy from TemaTres API."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
import xml.etree.ElementTree as ET
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from urllib.request import Request, urlopen
|
|
7
|
+
from urllib.error import URLError
|
|
8
|
+
|
|
9
|
+
from agift.common import AGIFT_TOP_TO_DCAT, TEMATRES_BASE
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class AgiftTerm:
    """A single AGIFT vocabulary term.

    Instances are produced by ``fetch_full_hierarchy`` from the TemaTres
    responses; the field order is part of the constructor signature.
    """
    # Numeric term identifier parsed from the <term_id> XML element.
    term_id: int
    # Preferred label, whitespace-stripped.
    label: str
    # term_id of the direct parent; None for top-level terms.
    parent_id: int | None
    # term_id of the top-level function this term sits under (a top-level
    # term references itself).
    top_level_id: int | None
    # Hierarchy level: 1 for top-level functions, 2 and 3 below.
    depth: int
    # DCAT-AP theme code of the top-level function ("GOVE" when the label
    # has no entry in AGIFT_TOP_TO_DCAT).
    dcat_theme: str
    # Non-preferred labels; empty when alt-label fetching is skipped or fails.
    alt_labels: list[str] = field(default_factory=list)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _fetch_xml(task: str, arg: str = "") -> ET.Element:
    """Call one TemaTres service task and return the parsed XML root.

    The request is tried up to three times. After a failed first or second
    attempt the error is printed and the function sleeps 5 s or 10 s
    respectively; a third failure is re-raised to the caller.
    """
    arg_part = f"&arg={arg}" if arg else ""
    url = f"{TEMATRES_BASE}?task={task}{arg_part}"

    max_attempts = 3
    attempt = 0
    while True:
        attempt += 1
        try:
            request = Request(url, headers={"User-Agent": "AGIFT-Graph-Import/1.0"})
            with urlopen(request, timeout=120) as response:
                payload = response.read().decode("utf-8")
            return ET.fromstring(payload)
        except (URLError, TimeoutError, ET.ParseError) as err:
            if attempt == max_attempts:
                raise
            print(f" Retry {attempt} for {task} {arg}: {err}")
            # Linear back-off: 5 s after the first failure, 10 s after the second.
            time.sleep(5 * attempt)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _parse_terms(root: ET.Element) -> list[tuple[int, str]]:
    """Collect (term_id, label) pairs from a TemaTres XML response.

    Every descendant <term> element contributes one pair, provided both its
    <term_id> and <string> children exist and carry non-empty text. Labels
    are whitespace-stripped; document order is preserved.
    """
    # findtext yields None for a missing child and "" for an empty one, so a
    # plain truthiness check covers both "absent" and "no text".
    raw_pairs = (
        (node.findtext("term_id"), node.findtext("string"))
        for node in root.findall(".//term")
    )
    return [
        (int(raw_id), raw_label.strip())
        for raw_id, raw_label in raw_pairs
        if raw_id and raw_label
    ]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _fetch_alt_labels(term_id: int) -> list[str]:
    """Fetch non-preferred (alternative) labels for a term.

    This is best-effort: any failure while fetching or parsing is reported
    on stdout and results in an empty list, so the caller's hierarchy walk
    continues.

    Args:
        term_id: Identifier of the term to look up.

    Returns:
        The alt labels in response order, or [] on failure.
    """
    try:
        root = _fetch_xml("fetchAlt", str(term_id))
        return [label for _, label in _parse_terms(root)]
    except Exception as exc:
        # Report instead of swallowing silently, so missing alt labels can
        # be traced back to the request that failed.
        print(f" WARNING: could not fetch alt labels for term {term_id}: {exc}")
        return []
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def fetch_full_hierarchy(include_alts: bool = True) -> list[AgiftTerm]:
    """Download the three-level AGIFT hierarchy from TemaTres.

    Top-level functions are fetched first; for each one its children
    (level 2) and grandchildren (level 3) are fetched depth-first. Every term
    inherits the DCAT theme of its top-level function. After each top-level
    subtree the function sleeps for two seconds.

    Args:
        include_alts: If True, fetch alt labels for each term (slower).

    Returns:
        List of AgiftTerm objects, each parent preceding its descendants.
    """

    def alts_for(term_id):
        # One extra request per term, skipped entirely when disabled.
        return _fetch_alt_labels(term_id) if include_alts else []

    def children_of(term_id):
        return _parse_terms(_fetch_xml("fetchDown", str(term_id)))

    print("Fetching AGIFT top-level terms...")
    top_terms = _parse_terms(_fetch_xml("fetchTopTerms"))
    print(f" Found {len(top_terms)} top-level functions")
    if not include_alts:
        print(" (skipping alt labels)")

    collected: list[AgiftTerm] = []

    for top_id, top_label in top_terms:
        theme = AGIFT_TOP_TO_DCAT.get(top_label.lower())
        if not theme:
            print(f" WARNING: No DCAT mapping for top-level '{top_label}', using GOVE")
            theme = "GOVE"

        collected.append(AgiftTerm(
            term_id=top_id,
            label=top_label,
            parent_id=None,
            top_level_id=top_id,
            depth=1,
            dcat_theme=theme,
            alt_labels=alts_for(top_id),
        ))

        # Level 2
        second_level = children_of(top_id)
        print(f" {top_label} ({theme}): {len(second_level)} L2 terms")

        for mid_id, mid_label in second_level:
            collected.append(AgiftTerm(
                term_id=mid_id,
                label=mid_label,
                parent_id=top_id,
                top_level_id=top_id,
                depth=2,
                dcat_theme=theme,
                alt_labels=alts_for(mid_id),
            ))

            # Level 3
            for leaf_id, leaf_label in children_of(mid_id):
                collected.append(AgiftTerm(
                    term_id=leaf_id,
                    label=leaf_label,
                    parent_id=mid_id,
                    top_level_id=top_id,
                    depth=3,
                    dcat_theme=theme,
                    alt_labels=alts_for(leaf_id),
                ))

        # Be polite to the API
        time.sleep(2)

    return collected
|
agift/graph.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Stage 2: Graph — upsert AGIFT terms and structural edges into Neo4j."""
|
|
2
|
+
|
|
3
|
+
from agift.fetch import AgiftTerm
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def ensure_schema(driver) -> None:
    """Create the Term constraint and indexes if they do not exist yet.

    Issues three ``IF NOT EXISTS`` statements: a uniqueness constraint on
    ``Term.term_id`` and indexes on ``Term.dcat_theme`` and ``Term.depth``.

    Args:
        driver: Neo4j driver.
    """
    statements = (
        "CREATE CONSTRAINT term_id_unique IF NOT EXISTS "
        "FOR (t:Term) REQUIRE t.term_id IS UNIQUE",
        "CREATE INDEX term_dcat IF NOT EXISTS "
        "FOR (t:Term) ON (t.dcat_theme)",
        "CREATE INDEX term_depth IF NOT EXISTS "
        "FOR (t:Term) ON (t.depth)",
    )
    with driver.session() as session:
        for statement in statements:
            session.run(statement)
    print("Neo4j schema ensured.")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def upsert_graph(driver, terms: "list[AgiftTerm]") -> dict:
    """Upsert AGIFT terms as Neo4j nodes with PARENT_OF edges.

    Uses MERGE to avoid duplicates. Detects changed labels / alt labels
    to flag terms that need re-embedding.

    Args:
        driver: Neo4j driver.
        terms: List of AgiftTerm objects.

    Returns:
        Dict with ``created`` (nodes MERGE actually created), ``updated``
        (existing nodes whose label or alt labels differ) and
        ``unchanged`` (all other nodes) counts, plus ``changed_ids``: the
        term_ids that need (re-)embedding, i.e. every created or changed
        term and every term whose node has no embedding yet.
    """
    # The annotation above is quoted on purpose: the package metadata
    # declares Python 3.8 support, where ``list[...]`` cannot be
    # subscripted when the function is defined.

    # Sort so parents come before children; the PARENT_OF query below
    # MATCHes the parent node and would silently do nothing otherwise.
    terms_sorted = sorted(terms, key=lambda t: t.depth)

    stats = {"created": 0, "updated": 0, "unchanged": 0, "changed_ids": []}

    with driver.session() as session:
        for t in terms_sorted:
            result = session.run(
                """
                MERGE (t:Term {term_id: $term_id})
                ON CREATE SET
                    t.label = $label,
                    t.label_norm = $label_norm,
                    t.depth = $depth,
                    t.dcat_theme = $dcat_theme,
                    t.top_level_id = $top_level_id,
                    t.alt_labels = $alt_labels,
                    t.created_at = datetime(),
                    t._created = true,
                    t._changed = true
                ON MATCH SET
                    t._created = false,
                    t._changed = (coalesce(t.label <> $label, true)
                        OR coalesce(t.alt_labels <> $alt_labels, true)),
                    t.label = $label,
                    t.label_norm = $label_norm,
                    t.depth = $depth,
                    t.dcat_theme = $dcat_theme,
                    t.top_level_id = $top_level_id,
                    t.alt_labels = $alt_labels
                RETURN t._changed AS changed,
                       t._created AS created,
                       t.embedding IS NULL AS no_embed
                """,
                term_id=t.term_id,
                label=t.label,
                label_norm=t.label.lower().strip(),
                depth=t.depth,
                dcat_theme=t.dcat_theme,
                top_level_id=t.top_level_id,
                alt_labels=t.alt_labels,
            )
            record = result.single()

            # Anything new, changed, or still lacking a vector must be
            # picked up by the embedding stage.
            if record["changed"] or record["no_embed"]:
                stats["changed_ids"].append(t.term_id)

            # Classify by what MERGE really did. A missing embedding alone
            # does not mean the node was created in this run.
            if record["created"]:
                stats["created"] += 1
            elif record["changed"]:
                stats["updated"] += 1
            else:
                stats["unchanged"] += 1

            # Create PARENT_OF edge
            if t.parent_id is not None:
                session.run(
                    """
                    MATCH (parent:Term {term_id: $parent_id})
                    MATCH (child:Term {term_id: $child_id})
                    MERGE (parent)-[:PARENT_OF]->(child)
                    """,
                    parent_id=t.parent_id,
                    child_id=t.term_id,
                )

    # Clean up the transient bookkeeping flags
    with driver.session() as session:
        session.run("MATCH (t:Term) REMOVE t._changed, t._created")

    return stats
|
agift/link.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Stage 4: Link — build semantic similarity edges from embeddings."""
|
|
2
|
+
|
|
3
|
+
from agift.common import SEMANTIC_EDGE_WEIGHT, SIMILARITY_THRESHOLD
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _cosine_similarity(a: list[float], b: list[float]) -> float:
|
|
7
|
+
"""Compute cosine similarity between two vectors.
|
|
8
|
+
|
|
9
|
+
Args:
|
|
10
|
+
a: First embedding vector.
|
|
11
|
+
b: Second embedding vector.
|
|
12
|
+
|
|
13
|
+
Returns:
|
|
14
|
+
Cosine similarity score between -1 and 1.
|
|
15
|
+
"""
|
|
16
|
+
dot = sum(x * y for x, y in zip(a, b))
|
|
17
|
+
norm_a = sum(x * x for x in a) ** 0.5
|
|
18
|
+
norm_b = sum(x * x for x in b) ** 0.5
|
|
19
|
+
if norm_a == 0 or norm_b == 0:
|
|
20
|
+
return 0.0
|
|
21
|
+
return dot / (norm_a * norm_b)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _replace_semantic_edges(tx, rows, weight, batch_size=1000):
    """Swap all SIMILAR_TO edges for ``rows`` inside one transaction.

    Args:
        tx: Neo4j managed transaction.
        rows: List of ``{"a_id", "b_id", "score"}`` dicts.
        weight: Edge weight stored on every created relationship.
        batch_size: Maximum number of rows sent per UNWIND statement.

    Returns:
        Number of relationships the database reports as created.
    """
    tx.run("MATCH ()-[r:SIMILAR_TO]->() DELETE r").consume()

    created = 0
    for start in range(0, len(rows), batch_size):
        summary = tx.run(
            """
            UNWIND $rows AS row
            MATCH (a:Term {term_id: row.a_id})
            MATCH (b:Term {term_id: row.b_id})
            CREATE (a)-[:SIMILAR_TO {
                score: row.score,
                weight: $weight,
                edge_type: 'semantic',
                created_at: datetime()
            }]->(b)
            """,
            rows=rows[start:start + batch_size],
            weight=weight,
        ).consume()
        created += summary.counters.relationships_created
    return created


def build_semantic_edges(
    driver,
    threshold: float = SIMILARITY_THRESHOLD,
    weight: float = SEMANTIC_EDGE_WEIGHT,
) -> dict:
    """Build SIMILAR_TO edges between terms with similar embeddings.

    Computes pairwise cosine similarity for all embedded terms and
    creates edges where similarity meets or exceeds the threshold. Skips
    pairs that already have a PARENT_OF edge (structural edges take
    priority).

    Only compares terms with the same embedding_dimension to avoid
    cross-dimension comparisons.

    All similarities are computed before the graph is touched; the old
    SIMILAR_TO edges are then deleted and the new ones created inside a
    single write transaction, so a failure during the (slow) comparison
    pass or during the write does not leave the graph without semantic
    edges.

    Args:
        driver: Neo4j driver.
        threshold: Minimum cosine similarity to create an edge.
        weight: Edge weight for query-time weighting.

    Returns:
        Dict with created, skipped_structural, and below_threshold counts.
    """
    from itertools import combinations

    stats = {"created": 0, "skipped_structural": 0, "below_threshold": 0}

    # Fetch all embedded terms grouped by dimension
    with driver.session() as session:
        result = session.run(
            """
            MATCH (t:Term)
            WHERE t.embedding IS NOT NULL
            RETURN t.term_id AS tid, t.embedding AS emb,
                   t.embedding_dimension AS dim
            ORDER BY t.term_id
            """
        )
        terms = [(r["tid"], r["emb"], r["dim"]) for r in result]

    if len(terms) < 2:
        print("  Not enough embedded terms for semantic edges")
        return stats

    # Group by dimension
    by_dim: dict[int, list[tuple[int, list[float]]]] = {}
    for tid, emb, dim in terms:
        by_dim.setdefault(dim, []).append((tid, emb))

    # Fetch existing structural edges for skip-check. Pairs are stored as
    # (smaller_id, larger_id) so direction does not matter on lookup.
    with driver.session() as session:
        result = session.run(
            """
            MATCH (a:Term)-[:PARENT_OF]-(b:Term)
            RETURN a.term_id AS a_id, b.term_id AS b_id
            """
        )
        structural_pairs = {
            (min(r["a_id"], r["b_id"]), max(r["a_id"], r["b_id"]))
            for r in result
        }

    # Pure computation first: nothing is written until every pair has
    # been scored.
    rows = []
    for dim, dim_terms in by_dim.items():
        print(f"  Computing similarities for {len(dim_terms)} terms "
              f"(dim={dim})...")

        for (tid_a, emb_a), (tid_b, emb_b) in combinations(dim_terms, 2):
            # Skip if structural edge exists
            pair_key = (min(tid_a, tid_b), max(tid_a, tid_b))
            if pair_key in structural_pairs:
                stats["skipped_structural"] += 1
                continue

            score = _cosine_similarity(emb_a, emb_b)
            if score >= threshold:
                rows.append(
                    {"a_id": tid_a, "b_id": tid_b, "score": round(score, 4)}
                )
            else:
                stats["below_threshold"] += 1

    # Rebuild fresh each run: clear and batch-create atomically.
    with driver.session() as session:
        stats["created"] = session.execute_write(
            _replace_semantic_edges, rows, weight
        )

    return stats
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agift-graph
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Australian Government Interactive Functions Thesaurus (AGIFT) as a Neo4j knowledge graph with embeddings and dual edge types
|
|
5
|
+
Author: AGIFT Graph Team
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/DeepCivic/AGIFT-graph-builder
|
|
8
|
+
Project-URL: Repository, https://github.com/DeepCivic/AGIFT-graph-builder
|
|
9
|
+
Project-URL: Issues, https://github.com/DeepCivic/AGIFT-graph-builder/issues
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Database
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: neo4j>=5.0.0
|
|
27
|
+
Provides-Extra: local
|
|
28
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == "local"
|
|
29
|
+
Provides-Extra: isaacus
|
|
30
|
+
Requires-Dist: isaacus>=0.1.0; extra == "isaacus"
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == "all"
|
|
33
|
+
Requires-Dist: isaacus>=0.1.0; extra == "all"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
36
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
37
|
+
Requires-Dist: flake8>=6.0.0; extra == "dev"
|
|
38
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# AGIFT Graph
|
|
42
|
+
|
|
43
|
+
Australian Government Interactive Functions Thesaurus (AGIFT) as a Neo4j knowledge graph with embeddings and dual edge types.
|
|
44
|
+
|
|
45
|
+
## What it does
|
|
46
|
+
|
|
47
|
+
Fetches the full AGIFT vocabulary from the [TemaTres API](https://vocabularyserver.com/agift/), builds a Neo4j graph with structural hierarchy edges, generates embeddings (free local or Isaacus API), then creates semantic similarity edges between related terms.
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
TemaTres API ──► Neo4j Graph ──► Embeddings ──► Semantic Edges
|
|
51
|
+
(AGIFT) (PARENT_OF) (384/512/768d) (SIMILAR_TO)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Graph model
|
|
55
|
+
|
|
56
|
+
Two edge types with different weights for query-time flexibility:
|
|
57
|
+
|
|
58
|
+
| Edge | Type | Weight | Description |
|
|
59
|
+
|------|------|--------|-------------|
|
|
60
|
+
| `PARENT_OF` | structural | 1.0 | AGIFT hierarchy (L1 → L2 → L3) |
|
|
61
|
+
| `SIMILAR_TO` | semantic | 0.5 | Cosine similarity above threshold |
|
|
62
|
+
|
|
63
|
+
Nodes carry DCAT-AP theme mappings for interoperability with European open data standards.
|
|
64
|
+
|
|
65
|
+
## Quick start
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
docker compose -f docker-compose.agift.yml up -d --build
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Then open the dashboard at http://localhost:5050 and click "Full Pipeline" or "Graph Only".
|
|
72
|
+
|
|
73
|
+
## Embedding providers
|
|
74
|
+
|
|
75
|
+
| Provider | Cost | Dimensions | Setup |
|
|
76
|
+
|----------|------|-----------|-------|
|
|
77
|
+
| local (sentence-transformers) | Free | 384, 768 | Nothing — runs on CPU |
|
|
78
|
+
| isaacus (kanon-2-embedder) | Paid | 256–1792 | Set API key in dashboard |
|
|
79
|
+
|
|
80
|
+
The local provider uses `all-MiniLM-L6-v2` (384d) or `all-mpnet-base-v2` (768d). Models are downloaded on first run and cached in a Docker volume.
|
|
81
|
+
|
|
82
|
+
## Configuration
|
|
83
|
+
|
|
84
|
+
Copy `.env.example` to `.env` and edit:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
cp agift/.env.example .env
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
| Variable | Default | Description |
|
|
91
|
+
|----------|---------|-------------|
|
|
92
|
+
| `NEO4J_PASSWORD` | `changeme` | Neo4j database password |
|
|
93
|
+
| `ISAACUS_API_KEY` | (empty) | Isaacus API key (optional) |
|
|
94
|
+
|
|
95
|
+
All other settings (dimension, provider, similarity threshold, semantic edge weight) are configured via the dashboard UI and stored in Neo4j.
|
|
96
|
+
|
|
97
|
+
## Services
|
|
98
|
+
|
|
99
|
+
| Service | Port | Description |
|
|
100
|
+
|---------|------|-------------|
|
|
101
|
+
| Neo4j Browser | 7474 | Graph database UI |
|
|
102
|
+
| Neo4j Bolt | 7687 | Database protocol |
|
|
103
|
+
| Dashboard | 5050 | Config, run controls, logs |
|
|
104
|
+
|
|
105
|
+
## CLI usage
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
# Full pipeline (fetch + graph + embed + semantic edges)
|
|
109
|
+
docker exec agift-worker python import_agift.py
|
|
110
|
+
|
|
111
|
+
# Graph only (no embeddings)
|
|
112
|
+
docker exec agift-worker python import_agift.py --skip-embed --skip-semantic
|
|
113
|
+
|
|
114
|
+
# Local embeddings, 384 dimensions
|
|
115
|
+
docker exec agift-worker python import_agift.py --provider local --dimension 384
|
|
116
|
+
|
|
117
|
+
# Force re-embed all terms
|
|
118
|
+
docker exec agift-worker python import_agift.py --force-embed
|
|
119
|
+
|
|
120
|
+
# Dry run (fetch from API, no writes)
|
|
121
|
+
docker exec agift-worker python import_agift.py --dry-run
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Docker Hub (no source code needed)
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
docker compose -f docker-compose.agift.hub.yml up -d
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Project structure
|
|
131
|
+
|
|
132
|
+
```
|
|
133
|
+
agift/
|
|
134
|
+
├── import_agift.py # 4-stage pipeline (fetch/graph/embed/link)
|
|
135
|
+
├── dashboard/
|
|
136
|
+
│ ├── Dockerfile
|
|
137
|
+
│ ├── app.py # Flask dashboard + run controls
|
|
138
|
+
│ └── templates/
|
|
139
|
+
│ └── index.html
|
|
140
|
+
├── worker/
|
|
141
|
+
│ ├── Dockerfile
|
|
142
|
+
│ └── entrypoint.sh # Cron scheduler + manual trigger
|
|
143
|
+
├── .env.example
|
|
144
|
+
├── LICENSE # Apache 2.0
|
|
145
|
+
└── README.md
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## Data source
|
|
149
|
+
|
|
150
|
+
AGIFT is maintained by the National Archives of Australia and published via TemaTres at https://vocabularyserver.com/agift/
|
|
151
|
+
|
|
152
|
+
## License
|
|
153
|
+
|
|
154
|
+
Apache 2.0 — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
agift/__init__.py,sha256=Nv3BHop1rxiR7ZHDG1FQwxEm0Ne7Tj5ilE1kXh5i7bI,1168
|
|
2
|
+
agift/__main__.py,sha256=U3MsYtYfPao2BWNUJcgFcmBuUO2ev-TVkdyUJaqppa4,76
|
|
3
|
+
agift/cli.py,sha256=0M1YRHrpM_wL5h5eH4pwy-JFBfB_tp88snE779w3Lig,7668
|
|
4
|
+
agift/common.py,sha256=y1XX-rEigUFFDcusHf_YMielxYaIqVRVls6pEqsBOS0,7277
|
|
5
|
+
agift/embed.py,sha256=lZ010c8EEvqM_L04Gti9RglOHj9eBZ1v3vt4obTFfSg,6582
|
|
6
|
+
agift/fetch.py,sha256=YR_9aD7t7Myx3UxfIJNXFMY2ebb_ibuV2sfcUyMud5k,4166
|
|
7
|
+
agift/graph.py,sha256=Y--d552q0EPD3muk4hSxoS2wJ5Dr7WBsNMYMz21iaHE,3604
|
|
8
|
+
agift/link.py,sha256=9StPtioJnHRdid6KvyBHxalDPyabv4Yz83CG9HN6_G4,4364
|
|
9
|
+
agift_graph-0.1.0.dist-info/licenses/LICENSE,sha256=PpmP6dhntyBzqd7X9IqL5rrImvJ3xMLX1tzEnmBJ7sM,613
|
|
10
|
+
agift_graph-0.1.0.dist-info/METADATA,sha256=pWLwI89HYwIUIWvx3W7x3DZJQm2be6Q0Pf1xJ9hrzIc,5283
|
|
11
|
+
agift_graph-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
12
|
+
agift_graph-0.1.0.dist-info/entry_points.txt,sha256=4u5vYSbfUW6ySI_t44bT-6OFDgSzmUwSiSmttYXY8lo,41
|
|
13
|
+
agift_graph-0.1.0.dist-info/top_level.txt,sha256=ysu-RL3KOGNhbHKZmJdOBK0-HojbWwIO69KcwB5AFoQ,6
|
|
14
|
+
agift_graph-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
you may not use this file except in compliance with the License.
|
|
7
|
+
You may obtain a copy of the License at
|
|
8
|
+
|
|
9
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
|
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
See the License for the specific language governing permissions and
|
|
15
|
+
limitations under the License.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
agift
|