agift-graph 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agift_graph-0.1.0/LICENSE +15 -0
- agift_graph-0.1.0/PKG-INFO +154 -0
- agift_graph-0.1.0/README.md +114 -0
- agift_graph-0.1.0/agift/__init__.py +47 -0
- agift_graph-0.1.0/agift/__main__.py +5 -0
- agift_graph-0.1.0/agift/cli.py +200 -0
- agift_graph-0.1.0/agift/common.py +210 -0
- agift_graph-0.1.0/agift/embed.py +203 -0
- agift_graph-0.1.0/agift/fetch.py +122 -0
- agift_graph-0.1.0/agift/graph.py +105 -0
- agift_graph-0.1.0/agift/link.py +130 -0
- agift_graph-0.1.0/agift_graph.egg-info/PKG-INFO +154 -0
- agift_graph-0.1.0/agift_graph.egg-info/SOURCES.txt +17 -0
- agift_graph-0.1.0/agift_graph.egg-info/dependency_links.txt +1 -0
- agift_graph-0.1.0/agift_graph.egg-info/entry_points.txt +2 -0
- agift_graph-0.1.0/agift_graph.egg-info/requires.txt +17 -0
- agift_graph-0.1.0/agift_graph.egg-info/top_level.txt +1 -0
- agift_graph-0.1.0/pyproject.toml +81 -0
- agift_graph-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
you may not use this file except in compliance with the License.
|
|
7
|
+
You may obtain a copy of the License at
|
|
8
|
+
|
|
9
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
|
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
See the License for the specific language governing permissions and
|
|
15
|
+
limitations under the License.
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agift-graph
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Australian Government Interactive Functions Thesaurus (AGIFT) as a Neo4j knowledge graph with embeddings and dual edge types
|
|
5
|
+
Author: AGIFT Graph Team
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/DeepCivic/AGIFT-graph-builder
|
|
8
|
+
Project-URL: Repository, https://github.com/DeepCivic/AGIFT-graph-builder
|
|
9
|
+
Project-URL: Issues, https://github.com/DeepCivic/AGIFT-graph-builder/issues
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Database
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: neo4j>=5.0.0
|
|
27
|
+
Provides-Extra: local
|
|
28
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == "local"
|
|
29
|
+
Provides-Extra: isaacus
|
|
30
|
+
Requires-Dist: isaacus>=0.1.0; extra == "isaacus"
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == "all"
|
|
33
|
+
Requires-Dist: isaacus>=0.1.0; extra == "all"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
36
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
37
|
+
Requires-Dist: flake8>=6.0.0; extra == "dev"
|
|
38
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# AGIFT Graph
|
|
42
|
+
|
|
43
|
+
Australian Government Interactive Functions Thesaurus (AGIFT) as a Neo4j knowledge graph with embeddings and dual edge types.
|
|
44
|
+
|
|
45
|
+
## What it does
|
|
46
|
+
|
|
47
|
+
Fetches the full AGIFT vocabulary from the [TemaTres API](https://vocabularyserver.com/agift/), builds a Neo4j graph with structural hierarchy edges, generates embeddings (free local or Isaacus API), then creates semantic similarity edges between related terms.
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
TemaTres API ──► Neo4j Graph ──► Embeddings ──► Semantic Edges
|
|
51
|
+
(AGIFT) (PARENT_OF) (384/512/768d) (SIMILAR_TO)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Graph model
|
|
55
|
+
|
|
56
|
+
Two edge types with different weights for query-time flexibility:
|
|
57
|
+
|
|
58
|
+
| Edge | Type | Weight | Description |
|
|
59
|
+
|------|------|--------|-------------|
|
|
60
|
+
| `PARENT_OF` | structural | 1.0 | AGIFT hierarchy (L1 → L2 → L3) |
|
|
61
|
+
| `SIMILAR_TO` | semantic | 0.5 | Cosine similarity above threshold |
|
|
62
|
+
|
|
63
|
+
Nodes carry DCAT-AP theme mappings for interoperability with European open data standards.
|
|
64
|
+
|
|
65
|
+
## Quick start
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
docker compose -f docker-compose.agift.yml up -d --build
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Then open the dashboard at http://localhost:5050 and click "Full Pipeline" or "Graph Only".
|
|
72
|
+
|
|
73
|
+
## Embedding providers
|
|
74
|
+
|
|
75
|
+
| Provider | Cost | Dimensions | Setup |
|
|
76
|
+
|----------|------|-----------|-------|
|
|
77
|
+
| local (sentence-transformers) | Free | 384, 768 | Nothing — runs on CPU |
|
|
78
|
+
| isaacus (kanon-2-embedder) | Paid | 256–1792 | Set API key in dashboard |
|
|
79
|
+
|
|
80
|
+
The local provider uses `all-MiniLM-L6-v2` (384d) or `all-mpnet-base-v2` (768d). Models are downloaded on first run and cached in a Docker volume.
|
|
81
|
+
|
|
82
|
+
## Configuration
|
|
83
|
+
|
|
84
|
+
Copy `agift/.env.example` to `.env` and edit it:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
cp agift/.env.example .env
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
| Variable | Default | Description |
|
|
91
|
+
|----------|---------|-------------|
|
|
92
|
+
| `NEO4J_PASSWORD` | `changeme` | Neo4j database password |
|
|
93
|
+
| `ISAACUS_API_KEY` | (empty) | Isaacus API key (optional) |
|
|
94
|
+
|
|
95
|
+
All other settings (dimension, provider, similarity threshold, semantic edge weight) are configured via the dashboard UI and stored in Neo4j.
|
|
96
|
+
|
|
97
|
+
## Services
|
|
98
|
+
|
|
99
|
+
| Service | Port | Description |
|
|
100
|
+
|---------|------|-------------|
|
|
101
|
+
| Neo4j Browser | 7474 | Graph database UI |
|
|
102
|
+
| Neo4j Bolt | 7687 | Database protocol |
|
|
103
|
+
| Dashboard | 5050 | Config, run controls, logs |
|
|
104
|
+
|
|
105
|
+
## CLI usage
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
# Full pipeline (fetch + graph + embed + semantic edges)
|
|
109
|
+
docker exec agift-worker python import_agift.py
|
|
110
|
+
|
|
111
|
+
# Graph only (no embeddings)
|
|
112
|
+
docker exec agift-worker python import_agift.py --skip-embed --skip-semantic
|
|
113
|
+
|
|
114
|
+
# Local embeddings, 384 dimensions
|
|
115
|
+
docker exec agift-worker python import_agift.py --provider local --dimension 384
|
|
116
|
+
|
|
117
|
+
# Force re-embed all terms
|
|
118
|
+
docker exec agift-worker python import_agift.py --force-embed
|
|
119
|
+
|
|
120
|
+
# Dry run (fetch from API, no writes)
|
|
121
|
+
docker exec agift-worker python import_agift.py --dry-run
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Docker Hub (no source code needed)
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
docker compose -f docker-compose.agift.hub.yml up -d
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Project structure
|
|
131
|
+
|
|
132
|
+
```
|
|
133
|
+
agift/
|
|
134
|
+
├── import_agift.py # 4-stage pipeline (fetch/graph/embed/link)
|
|
135
|
+
├── dashboard/
|
|
136
|
+
│ ├── Dockerfile
|
|
137
|
+
│ ├── app.py # Flask dashboard + run controls
|
|
138
|
+
│ └── templates/
|
|
139
|
+
│ └── index.html
|
|
140
|
+
├── worker/
|
|
141
|
+
│ ├── Dockerfile
|
|
142
|
+
│ └── entrypoint.sh # Cron scheduler + manual trigger
|
|
143
|
+
├── .env.example
|
|
144
|
+
├── LICENSE # Apache 2.0
|
|
145
|
+
└── README.md
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## Data source
|
|
149
|
+
|
|
150
|
+
AGIFT is maintained by the National Archives of Australia and published via TemaTres at https://vocabularyserver.com/agift/
|
|
151
|
+
|
|
152
|
+
## License
|
|
153
|
+
|
|
154
|
+
Apache 2.0 — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# AGIFT Graph
|
|
2
|
+
|
|
3
|
+
Australian Government Interactive Functions Thesaurus (AGIFT) as a Neo4j knowledge graph with embeddings and dual edge types.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
Fetches the full AGIFT vocabulary from the [TemaTres API](https://vocabularyserver.com/agift/), builds a Neo4j graph with structural hierarchy edges, generates embeddings (free local or Isaacus API), then creates semantic similarity edges between related terms.
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
TemaTres API ──► Neo4j Graph ──► Embeddings ──► Semantic Edges
|
|
11
|
+
(AGIFT) (PARENT_OF) (384/512/768d) (SIMILAR_TO)
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Graph model
|
|
15
|
+
|
|
16
|
+
Two edge types with different weights for query-time flexibility:
|
|
17
|
+
|
|
18
|
+
| Edge | Type | Weight | Description |
|
|
19
|
+
|------|------|--------|-------------|
|
|
20
|
+
| `PARENT_OF` | structural | 1.0 | AGIFT hierarchy (L1 → L2 → L3) |
|
|
21
|
+
| `SIMILAR_TO` | semantic | 0.5 | Cosine similarity above threshold |
|
|
22
|
+
|
|
23
|
+
Nodes carry DCAT-AP theme mappings for interoperability with European open data standards.
|
|
24
|
+
|
|
25
|
+
## Quick start
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
docker compose -f docker-compose.agift.yml up -d --build
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Then open the dashboard at http://localhost:5050 and click "Full Pipeline" or "Graph Only".
|
|
32
|
+
|
|
33
|
+
## Embedding providers
|
|
34
|
+
|
|
35
|
+
| Provider | Cost | Dimensions | Setup |
|
|
36
|
+
|----------|------|-----------|-------|
|
|
37
|
+
| local (sentence-transformers) | Free | 384, 768 | Nothing — runs on CPU |
|
|
38
|
+
| isaacus (kanon-2-embedder) | Paid | 256–1792 | Set API key in dashboard |
|
|
39
|
+
|
|
40
|
+
The local provider uses `all-MiniLM-L6-v2` (384d) or `all-mpnet-base-v2` (768d). Models are downloaded on first run and cached in a Docker volume.
|
|
41
|
+
|
|
42
|
+
## Configuration
|
|
43
|
+
|
|
44
|
+
Copy `agift/.env.example` to `.env` and edit it:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
cp agift/.env.example .env
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
| Variable | Default | Description |
|
|
51
|
+
|----------|---------|-------------|
|
|
52
|
+
| `NEO4J_PASSWORD` | `changeme` | Neo4j database password |
|
|
53
|
+
| `ISAACUS_API_KEY` | (empty) | Isaacus API key (optional) |
|
|
54
|
+
|
|
55
|
+
All other settings (dimension, provider, similarity threshold, semantic edge weight) are configured via the dashboard UI and stored in Neo4j.
|
|
56
|
+
|
|
57
|
+
## Services
|
|
58
|
+
|
|
59
|
+
| Service | Port | Description |
|
|
60
|
+
|---------|------|-------------|
|
|
61
|
+
| Neo4j Browser | 7474 | Graph database UI |
|
|
62
|
+
| Neo4j Bolt | 7687 | Database protocol |
|
|
63
|
+
| Dashboard | 5050 | Config, run controls, logs |
|
|
64
|
+
|
|
65
|
+
## CLI usage
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# Full pipeline (fetch + graph + embed + semantic edges)
|
|
69
|
+
docker exec agift-worker python import_agift.py
|
|
70
|
+
|
|
71
|
+
# Graph only (no embeddings)
|
|
72
|
+
docker exec agift-worker python import_agift.py --skip-embed --skip-semantic
|
|
73
|
+
|
|
74
|
+
# Local embeddings, 384 dimensions
|
|
75
|
+
docker exec agift-worker python import_agift.py --provider local --dimension 384
|
|
76
|
+
|
|
77
|
+
# Force re-embed all terms
|
|
78
|
+
docker exec agift-worker python import_agift.py --force-embed
|
|
79
|
+
|
|
80
|
+
# Dry run (fetch from API, no writes)
|
|
81
|
+
docker exec agift-worker python import_agift.py --dry-run
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Docker Hub (no source code needed)
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
docker compose -f docker-compose.agift.hub.yml up -d
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Project structure
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
agift/
|
|
94
|
+
├── import_agift.py # 4-stage pipeline (fetch/graph/embed/link)
|
|
95
|
+
├── dashboard/
|
|
96
|
+
│ ├── Dockerfile
|
|
97
|
+
│ ├── app.py # Flask dashboard + run controls
|
|
98
|
+
│ └── templates/
|
|
99
|
+
│ └── index.html
|
|
100
|
+
├── worker/
|
|
101
|
+
│ ├── Dockerfile
|
|
102
|
+
│ └── entrypoint.sh # Cron scheduler + manual trigger
|
|
103
|
+
├── .env.example
|
|
104
|
+
├── LICENSE # Apache 2.0
|
|
105
|
+
└── README.md
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Data source
|
|
109
|
+
|
|
110
|
+
AGIFT is maintained by the National Archives of Australia and published via TemaTres at https://vocabularyserver.com/agift/
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
Apache 2.0 — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""AGIFT Graph Builder — modular ETL pipeline for Neo4j."""
|
|
2
|
+
|
|
3
|
+
from agift.common import (
|
|
4
|
+
AGIFT_TOP_TO_DCAT,
|
|
5
|
+
LOCAL_MODELS,
|
|
6
|
+
PROVIDER_ISAACUS,
|
|
7
|
+
PROVIDER_LOCAL,
|
|
8
|
+
SEMANTIC_EDGE_WEIGHT,
|
|
9
|
+
SIMILARITY_THRESHOLD,
|
|
10
|
+
STRUCTURAL_EDGE_WEIGHT,
|
|
11
|
+
TEMATRES_BASE,
|
|
12
|
+
VALID_DIMENSIONS,
|
|
13
|
+
VALID_PROVIDERS,
|
|
14
|
+
get_config_from_neo4j,
|
|
15
|
+
get_neo4j_driver,
|
|
16
|
+
log_run,
|
|
17
|
+
print_summary,
|
|
18
|
+
)
|
|
19
|
+
from agift.fetch import AgiftTerm, fetch_full_hierarchy
|
|
20
|
+
from agift.graph import ensure_schema, upsert_graph
|
|
21
|
+
from agift.embed import build_hierarchical_text, embed_terms, embed_terms_local
|
|
22
|
+
from agift.link import build_semantic_edges
|
|
23
|
+
|
|
24
|
+
# Public API of the ``agift`` package.  Each entry is one of the names
# imported at the top of this module; the list is kept in ASCII sort order,
# so the upper-case names come before the lower-case function names.
__all__ = [
    "AGIFT_TOP_TO_DCAT",
    "AgiftTerm",
    "LOCAL_MODELS",
    "PROVIDER_ISAACUS",
    "PROVIDER_LOCAL",
    "SEMANTIC_EDGE_WEIGHT",
    "SIMILARITY_THRESHOLD",
    "STRUCTURAL_EDGE_WEIGHT",
    "TEMATRES_BASE",
    "VALID_DIMENSIONS",
    "VALID_PROVIDERS",
    "build_hierarchical_text",
    "build_semantic_edges",
    "embed_terms",
    "embed_terms_local",
    "ensure_schema",
    "fetch_full_hierarchy",
    "get_config_from_neo4j",
    "get_neo4j_driver",
    "log_run",
    "print_summary",
    "upsert_graph",
]
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI entry point for AGIFT import pipeline.
|
|
3
|
+
|
|
4
|
+
Usage (via pip install):
|
|
5
|
+
agift
|
|
6
|
+
agift --dry-run
|
|
7
|
+
agift --skip-embed
|
|
8
|
+
agift --force-embed
|
|
9
|
+
agift --skip-semantic
|
|
10
|
+
|
|
11
|
+
Usage (direct):
|
|
12
|
+
python -m agift
|
|
13
|
+
python import_agift.py
|
|
14
|
+
|
|
15
|
+
Source: https://vocabularyserver.com/agift/services.php
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import os
|
|
20
|
+
import sys
|
|
21
|
+
import time
|
|
22
|
+
from datetime import datetime, timezone
|
|
23
|
+
|
|
24
|
+
from agift.common import (
|
|
25
|
+
PROVIDER_ISAACUS,
|
|
26
|
+
PROVIDER_LOCAL,
|
|
27
|
+
TEMATRES_BASE,
|
|
28
|
+
VALID_DIMENSIONS,
|
|
29
|
+
VALID_PROVIDERS,
|
|
30
|
+
get_config_from_neo4j,
|
|
31
|
+
get_neo4j_driver,
|
|
32
|
+
log_run,
|
|
33
|
+
print_summary,
|
|
34
|
+
)
|
|
35
|
+
from agift.fetch import fetch_full_hierarchy
|
|
36
|
+
from agift.graph import ensure_schema, upsert_graph
|
|
37
|
+
from agift.embed import embed_terms, embed_terms_local
|
|
38
|
+
from agift.link import build_semantic_edges
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _build_parser():
    """Return the argument parser for the AGIFT import command line."""
    parser = argparse.ArgumentParser(
        description="Import AGIFT vocabulary into Neo4j graph with embeddings"
    )
    parser.add_argument("--dry-run", action="store_true",
                        help="Fetch from API but don't write to Neo4j")
    parser.add_argument("--skip-alt", action="store_true",
                        help="Skip fetching alt labels (faster)")
    parser.add_argument("--skip-embed", action="store_true",
                        help="Skip embedding generation")
    parser.add_argument("--force-embed", action="store_true",
                        help="Re-embed all terms, not just new/changed")
    parser.add_argument("--skip-semantic", action="store_true",
                        help="Skip semantic edge generation")
    parser.add_argument("--provider", choices=list(VALID_PROVIDERS),
                        help="Override embedding provider (isaacus or local)")
    parser.add_argument("--dimension", type=int, choices=list(VALID_DIMENSIONS),
                        help="Override embedding dimension")
    parser.add_argument("--threshold", type=float, default=None,
                        help="Cosine similarity threshold for semantic edges")
    return parser


def _print_dry_run_preview(terms, limit=10):
    """Print the first ``limit`` fetched terms for a ``--dry-run`` invocation."""
    print("\n[DRY RUN] Skipping Neo4j write and embedding.")
    for t in terms[:limit]:
        alts = f" (alts: {', '.join(t.alt_labels)})" if t.alt_labels else ""
        print(f" L{t.depth} [{t.dcat_theme}] {t.label}{alts}")
    # Only mention a remainder when there is one; with `limit` or fewer terms
    # the unconditional message would report a zero or negative count.
    remaining = len(terms) - limit
    if remaining > 0:
        print(f" ... and {remaining} more")


def _run_embed_stage(driver, args, graph_stats, run_details):
    """Run stage 3 (embedding) and record its outcome in ``run_details``.

    Sets the ``embedded``, ``embed_failed`` and ``embedding_provider`` keys.
    The provider and dimension come from the command line when given,
    otherwise from the configuration returned by ``get_config_from_neo4j``.
    """
    if args.skip_embed:
        print("\nStage 3: Skipped (--skip-embed)")
        run_details["embedded"] = 0
        run_details["embed_failed"] = 0
        run_details["embedding_provider"] = ""
        return

    config = get_config_from_neo4j(driver)
    provider = args.provider or config["embedding_provider"]
    dimension = args.dimension or config["embedding_dimension"]
    run_details["embedding_provider"] = provider

    # --force-embed targets every Term node; otherwise only the terms that
    # the graph upsert reported as changed.
    if args.force_embed:
        with driver.session() as session:
            result = session.run("MATCH (t:Term) RETURN t.term_id AS tid")
            embed_ids = [r["tid"] for r in result]
    else:
        embed_ids = graph_stats["changed_ids"]

    embed_stats = None  # stays None when no embedding call is made
    if not embed_ids:
        print("\nStage 3: No terms need embedding")
    elif provider == PROVIDER_LOCAL:
        print(f"\nStage 3: Local embedding {len(embed_ids)} terms "
              f"(dimension={dimension})...")
        embed_stats = embed_terms_local(driver, embed_ids, dimension)
    else:
        # Isaacus provider: the key stored in Neo4j wins over the environment.
        api_key = config["isaacus_api_key"] or os.environ.get("ISAACUS_API_KEY")
        if not api_key:
            print("\nStage 3: Skipped (no Isaacus API key configured)")
            print(" Set via dashboard, or use --provider local")
        else:
            print(f"\nStage 3: Isaacus embedding {len(embed_ids)} terms "
                  f"(dimension={dimension})...")
            embed_stats = embed_terms(driver, embed_ids, api_key, dimension)

    if embed_stats is None:
        run_details["embedded"] = 0
        run_details["embed_failed"] = 0
    else:
        print(f" Embedded: {embed_stats['embedded']}")
        print(f" Failed: {embed_stats['failed']}")
        run_details["embedded"] = embed_stats["embedded"]
        run_details["embed_failed"] = embed_stats["failed"]


def _run_semantic_stage(driver, args, run_details):
    """Run stage 4 (semantic edges) and record ``semantic_edges_created``."""
    if args.skip_semantic:
        print("\nStage 4: Skipped (--skip-semantic)")
        run_details["semantic_edges_created"] = 0
        return

    config = get_config_from_neo4j(driver)
    # Compare against None rather than relying on truthiness: an explicit
    # `--threshold 0` is falsy and would otherwise be silently replaced by
    # the stored configuration value.
    if args.threshold is not None:
        threshold = args.threshold
    else:
        threshold = config["similarity_threshold"]
    sem_weight = config["semantic_edge_weight"]

    print(f"\nStage 4: Building semantic edges "
          f"(threshold={threshold}, weight={sem_weight})...")
    sem_stats = build_semantic_edges(driver, threshold, sem_weight)
    print(f" Created: {sem_stats['created']}")
    print(f" Skipped (structural): {sem_stats['skipped_structural']}")
    print(f" Below threshold: {sem_stats['below_threshold']}")
    run_details["semantic_edges_created"] = sem_stats["created"]


def main(argv=None):
    """CLI entry point for AGIFT import pipeline.

    Runs the four stages in order: fetch the vocabulary, upsert the graph,
    embed terms, and build semantic edges. With ``--dry-run`` only the fetch
    runs and a short preview is printed.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which is what the installed
            console script and ``python -m agift`` rely on.

    Raises:
        SystemExit: With status 1 when Neo4j cannot be reached or any stage
            after the fetch raises; argparse also exits on invalid arguments.
    """
    args = _build_parser().parse_args(argv)

    started_at = datetime.now(timezone.utc).isoformat()
    run_details = {"started_at": started_at}

    print("=" * 60)
    print("AGIFT Vocabulary Import (Neo4j + Embeddings + Semantic Edges)")
    print("=" * 60)
    print(f"Source: {TEMATRES_BASE}")
    print()

    # Stage 1: Fetch from TemaTres
    start = time.time()
    terms = fetch_full_hierarchy(include_alts=not args.skip_alt)
    elapsed = time.time() - start
    print(f"\nFetched {len(terms)} terms in {elapsed:.1f}s")
    run_details["fetched"] = len(terms)

    total_alts = sum(len(t.alt_labels) for t in terms)
    print(f"Total alt labels: {total_alts}")

    if args.dry_run:
        _print_dry_run_preview(terms)
        return

    # Connect to Neo4j
    print("\nConnecting to Neo4j...")
    driver = None
    try:
        driver = get_neo4j_driver()
        driver.verify_connectivity()
    except Exception as e:
        print(f"ERROR: Cannot connect to Neo4j: {e}")
        # The driver may have been created before the connectivity check
        # failed; release it before exiting.
        if driver is not None:
            driver.close()
        sys.exit(1)

    try:
        # Stage 2: Build graph (structural edges)
        print("\nStage 2: Building graph (structural edges)...")
        ensure_schema(driver)
        graph_stats = upsert_graph(driver, terms)
        print(f" Created: {graph_stats['created']}")
        print(f" Updated: {graph_stats['updated']}")
        print(f" Unchanged: {graph_stats['unchanged']}")
        run_details.update({
            "created": graph_stats["created"],
            "updated": graph_stats["updated"],
            "unchanged": graph_stats["unchanged"],
        })

        # Stage 3: Embed
        _run_embed_stage(driver, args, graph_stats, run_details)

        # Stage 4: Semantic edges
        _run_semantic_stage(driver, args, run_details)

        print_summary(driver)
        log_run(driver, "success", run_details)

    except Exception as e:
        print(f"\nERROR: {e}")
        run_details["error"] = str(e)
        try:
            log_run(driver, "error", run_details)
        except Exception as log_err:
            # Recording the failure is best-effort, but say so instead of
            # hiding it; the original error above is still what is reported.
            print(f"WARNING: could not record failed run: {log_err}")
        sys.exit(1)
    finally:
        driver.close()

    print("\nDone.")


if __name__ == "__main__":
    main()
|