arangodb-schema-analyzer 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. arangodb_schema_analyzer-0.3.0/.gitignore +43 -0
  2. arangodb_schema_analyzer-0.3.0/CHANGELOG.md +129 -0
  3. arangodb_schema_analyzer-0.3.0/CONTRIBUTING.md +97 -0
  4. arangodb_schema_analyzer-0.3.0/LICENSE +174 -0
  5. arangodb_schema_analyzer-0.3.0/PKG-INFO +198 -0
  6. arangodb_schema_analyzer-0.3.0/README.md +152 -0
  7. arangodb_schema_analyzer-0.3.0/SECURITY.md +13 -0
  8. arangodb_schema_analyzer-0.3.0/docs/tool-contract/v1/README.md +42 -0
  9. arangodb_schema_analyzer-0.3.0/docs/tool-contract/v1/examples/request.analyze.json +31 -0
  10. arangodb_schema_analyzer-0.3.0/docs/tool-contract/v1/examples/response.analyze.json +29 -0
  11. arangodb_schema_analyzer-0.3.0/docs/tool-contract/v1/request.schema.json +186 -0
  12. arangodb_schema_analyzer-0.3.0/docs/tool-contract/v1/response.schema.json +200 -0
  13. arangodb_schema_analyzer-0.3.0/pyproject.toml +113 -0
  14. arangodb_schema_analyzer-0.3.0/schema_analyzer/__init__.py +28 -0
  15. arangodb_schema_analyzer-0.3.0/schema_analyzer/analyzer.py +653 -0
  16. arangodb_schema_analyzer-0.3.0/schema_analyzer/baseline.py +438 -0
  17. arangodb_schema_analyzer-0.3.0/schema_analyzer/cache.py +72 -0
  18. arangodb_schema_analyzer-0.3.0/schema_analyzer/cli.py +135 -0
  19. arangodb_schema_analyzer-0.3.0/schema_analyzer/conceptual.py +93 -0
  20. arangodb_schema_analyzer-0.3.0/schema_analyzer/defaults.py +70 -0
  21. arangodb_schema_analyzer-0.3.0/schema_analyzer/docs.py +60 -0
  22. arangodb_schema_analyzer-0.3.0/schema_analyzer/domain_detect.py +333 -0
  23. arangodb_schema_analyzer-0.3.0/schema_analyzer/errors.py +19 -0
  24. arangodb_schema_analyzer-0.3.0/schema_analyzer/eval/__init__.py +14 -0
  25. arangodb_schema_analyzer-0.3.0/schema_analyzer/eval/domain_loader.py +26 -0
  26. arangodb_schema_analyzer-0.3.0/schema_analyzer/eval/generator.py +217 -0
  27. arangodb_schema_analyzer-0.3.0/schema_analyzer/eval/runner.py +181 -0
  28. arangodb_schema_analyzer-0.3.0/schema_analyzer/eval/scoring.py +237 -0
  29. arangodb_schema_analyzer-0.3.0/schema_analyzer/exports.py +23 -0
  30. arangodb_schema_analyzer-0.3.0/schema_analyzer/mapping.py +153 -0
  31. arangodb_schema_analyzer-0.3.0/schema_analyzer/mcp_server.py +79 -0
  32. arangodb_schema_analyzer-0.3.0/schema_analyzer/owl_export.py +120 -0
  33. arangodb_schema_analyzer-0.3.0/schema_analyzer/providers/__init__.py +83 -0
  34. arangodb_schema_analyzer-0.3.0/schema_analyzer/providers/anthropic_provider.py +70 -0
  35. arangodb_schema_analyzer-0.3.0/schema_analyzer/providers/base.py +33 -0
  36. arangodb_schema_analyzer-0.3.0/schema_analyzer/providers/openai_provider.py +63 -0
  37. arangodb_schema_analyzer-0.3.0/schema_analyzer/providers/openrouter_provider.py +83 -0
  38. arangodb_schema_analyzer-0.3.0/schema_analyzer/py.typed +0 -0
  39. arangodb_schema_analyzer-0.3.0/schema_analyzer/reconcile.py +169 -0
  40. arangodb_schema_analyzer-0.3.0/schema_analyzer/snapshot.py +876 -0
  41. arangodb_schema_analyzer-0.3.0/schema_analyzer/statistics.py +261 -0
  42. arangodb_schema_analyzer-0.3.0/schema_analyzer/tool.py +285 -0
  43. arangodb_schema_analyzer-0.3.0/schema_analyzer/tool_contract/__init__.py +1 -0
  44. arangodb_schema_analyzer-0.3.0/schema_analyzer/tool_contract/v1/__init__.py +1 -0
  45. arangodb_schema_analyzer-0.3.0/schema_analyzer/tool_contract/v1/request.schema.json +186 -0
  46. arangodb_schema_analyzer-0.3.0/schema_analyzer/tool_contract/v1/response.schema.json +265 -0
  47. arangodb_schema_analyzer-0.3.0/schema_analyzer/tool_contract_v1.py +48 -0
  48. arangodb_schema_analyzer-0.3.0/schema_analyzer/types.py +57 -0
  49. arangodb_schema_analyzer-0.3.0/schema_analyzer/utils.py +118 -0
  50. arangodb_schema_analyzer-0.3.0/schema_analyzer/validation.py +170 -0
  51. arangodb_schema_analyzer-0.3.0/schema_analyzer/workflow.py +208 -0
@@ -0,0 +1,43 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyd
5
+ *.so
6
+ .Python
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+ .eggs/
11
+
12
+ # Virtual environments
13
+ .venv/
14
+ .venv-*/
15
+ venv/
16
+ env/
17
+
18
+ # Build / packaging
19
+ dist/
20
+ wheels/
21
+
22
+ # Test / tooling
23
+ .pytest_cache/
24
+ .mypy_cache/
25
+ .ruff_cache/
26
+ .coverage
27
+ coverage.xml
28
+
29
+ # OS / editor
30
+ .DS_Store
31
+ .idea/
32
+ .vscode/
33
+
34
+ # Project-specific
35
+ .schema-analyzer-cache/
36
+ .env
37
+ .env.*
38
+ !.env.example
39
+
40
+ # Eval outputs
41
+ eval_report*.json
42
+ eval_*.json
43
+
@@ -0,0 +1,129 @@
1
+ # Changelog
2
+
3
+ ## 0.3.0
4
+
5
+ First PyPI release. Consolidates the quality + contract work originally
6
+ slated for `0.2.0` (issues #2-#6) with the cheap schema-change probes
7
+ (#7) and PRD amendment (#8) that landed on `main` shortly after.
8
+ Version `0.2.0` was prepared on `main` but never published; `0.3.0` is
9
+ the first tag to reach PyPI.
10
+
11
+ ### Tool contract changes (breaking)
12
+
13
+ - **#6 key rename.** Property mappings now emit `field` (was
14
+ `physicalFieldName`). Relationship mappings now emit
15
+ `edgeCollectionName` and MUST NOT emit `collectionName` — the JSON
16
+ schema rejects the latter. Entity mappings still use
17
+ `collectionName` (unchanged). See `tool_contract/v1/response.schema.json`.
18
+ Consumers that previously ran a `_normalize_analyzer_pm` /
19
+ `_normalize_props` shim can delete it.
20
+
21
+ ### New features
22
+
23
+ - **#7 cheap schema-change probes.** Two new top-level helpers in
24
+ `schema_analyzer.snapshot` (re-exported from the package root):
25
+ - `fingerprint_physical_shape(db, *, exclude_collections=None)` — hashes
26
+ only the user-collection set, per-collection type (document vs edge),
27
+ and per-collection sorted index digests (`type`, `fields`, `unique`,
28
+ `sparse`, `vci`, `deduplicate`). Auto-generated index `name` / `id`
29
+ are excluded so restarts and rebuilds don't produce false positives.
30
+ Stable under ordinary INSERT / UPDATE / REMOVE writes.
31
+ - `fingerprint_physical_counts(db, *, exclude_collections=None)` —
32
+ shape fingerprint combined with `col.count()` per included
33
+ collection; changes whenever the shape or any row count changes.
34
+ Both probes read only python-arango primitives (`db.collections()`,
35
+ `col.indexes()`, `col.count()`) — no AQL, no samples, no analyzer
36
+ logic — so consumers can answer "has it changed?" in a few dozen
37
+ milliseconds instead of running the full `snapshot_physical_schema`.
38
+ Collection-level failures degrade gracefully (sentinel contribution)
39
+ rather than raising. `exclude_collections` lets callers using a
40
+ database-resident cache self-exclude their bookkeeping collection.
41
+ - **#3 statistics block.** `AgenticSchemaAnalyzer` now stamps
42
+ `metadata.statistics` with per-collection counts, per-entity
43
+ `estimated_count`, and a per-relationship bundle of `edge_count`,
44
+ `source_count`, `target_count`, `avg_out_degree`, `avg_in_degree`,
45
+ `cardinality_pattern` (`1:1` / `1:N` / `N:1` / `N:M`) and
46
+ `selectivity`. When no live DB is available,
47
+ `metadata.statistics_status` is `"skipped_no_db"` and `statistics` is
48
+ absent. Bounded AQL cost: one `LENGTH` per collection, one filtered
49
+ `COLLECT` per LABEL / GENERIC_WITH_TYPE subset.
50
+ - **#5 reconciliation step.** After the LLM returns, the analyzer
51
+ diffs its collection coverage against the snapshot and backfills any
52
+ missing collections via baseline inference. The merge is reported in
53
+ `metadata.reconciliation` with `llm_covered_collections`,
54
+ `snapshot_collections`, `backfilled_collections`, and `strategy`; a
55
+ user-visible warning is appended. No-op when the LLM's output is
56
+ already complete.
57
+
58
+ ### Quality
59
+
60
+ - **#4 discriminator hardening.** `_pick_best_type_field` now rejects
61
+ candidate type fields that look like identifiers (`*Id`, `*_id`,
62
+ `uuid`, etc.), carry too many distinct values
63
+ (`MAX_TYPE_FIELD_DISTINCT_VALUES=32`), or cover too little of the
64
+ collection (`MIN_TYPE_FIELD_COVERAGE_FRACTION=0.80`).
65
+ Single-distinct-value edge discriminators are still accepted under the
66
+ single-value-edge fallback. New tunables live in `defaults.py`.
67
+ - **#2 richer index flags.** `physicalMapping[...].indexes[*]` now
68
+ propagates `vci`, `deduplicate`, and `storedValues` from the raw
69
+ ArangoDB index metadata. Vertex-Centric Indexes are excluded from the
70
+ `indexed=True` heuristic on properties.
71
+
72
+ ### Documentation
73
+
74
+ - **#8 PRD §3.13.3 / §4.1 update.** The PRD now sanctions the
75
+ two-fingerprint model (shape vs counts), a four-state change-status
76
+ contract (`unchanged` / `stats_changed` / `shape_changed` /
77
+ `no_cache`), stats-only refresh as the product behavior for
78
+ `stats_changed`, storage-agnostic caching, and self-exclusion of
79
+ database-resident cache collections from the shape fingerprint.
80
+
81
+ ### CI / Release infrastructure
82
+
83
+ - Trusted-publisher GitHub Actions workflow (`publish.yml`) targeting
84
+ PyPI and TestPyPI via OIDC — no long-lived tokens.
85
+ - `sdist` allow-list tightened in `pyproject.toml` so source
86
+ distributions include only package code, licence, readme, changelog,
87
+ and the tool-contract JSON schemas.
88
+ - `schema_analyzer/py.typed` added so downstream type checkers pick up
89
+ the package's inline annotations.
90
+ - Ruff lint + format are enforced by CI; mypy runs in advisory mode.
91
+
92
+ ## 0.2.0 (never published)
93
+
94
+ `0.2.0` was bumped on `main` as the planned first PyPI release but the
95
+ tag was never cut — #7 and #8 landed before release and the scope was
96
+ rolled forward into `0.3.0`. Everything originally slated for `0.2.0`
97
+ is part of `0.3.0` above.
98
+
99
+ ## 0.1.0
100
+
101
+ ### Initial release
102
+
103
+ - Physical schema snapshotting with deterministic ordering and fingerprinting
104
+ - Conceptual schema inference (entities, relationships, properties)
105
+ - Physical mapping generation (COLLECTION, LABEL, DEDICATED_COLLECTION, GENERIC_WITH_TYPE)
106
+ - AQL fragment helpers (`aql_entity_match`, `aql_relationship_traversal`) with injection-safe bind parameters
107
+ - LLM-assisted analysis with generate → validate → repair loop
108
+ - Provider support: OpenAI, Anthropic, OpenRouter (pluggable registry)
109
+ - Deterministic baseline inference when no LLM is configured (graceful degradation)
110
+ - Filesystem caching keyed by schema fingerprint with configurable TTL
111
+ - Tool contract v1: stable JSON API (stdin/stdout) with request/response schema validation
112
+ - CLI: tool mode (stdin JSON) and eval subcommand
113
+ - Output formats: analysis JSON, snapshot, export (Cypher), Markdown docs, OWL Turtle
114
+ - Evaluation harness with 5 domain packs, physical schema generator, and F1/accuracy scoring
115
+ - Eval report comparison for tracking quality regressions
116
+
117
+ ### Quality improvements
118
+
119
+ - Centralized tunable defaults in `defaults.py` (LLM parameters, timeouts, confidence, cache)
120
+ - Unified `pascal_case()` utility replacing duplicate implementations
121
+ - Shared test helpers extracted into `conftest.py`
122
+ - Consolidated sync/async workflow via shared `_parse_and_validate()` helper
123
+ - Catch-all error handler in tool entrypoint for contract-shaped error responses
124
+ - Eliminated redundant snapshot work (tool passes pre-built snapshot to analyzer)
125
+ - TYPE_CHECKING guards for `StandardDatabase` imports
126
+ - Proper exception chaining (`raise ... from e`) across all providers and workflow
127
+ - Logging in tool.py (operation tracking) and cache.py (corrupt file warnings)
128
+ - CI: pip caching, integration tests trigger on PRs, coverage threshold at 65%
129
+ - Test coverage for: cache, docs, exports, OWL export, validation, providers, conceptual schema, CLI, tool happy paths
@@ -0,0 +1,97 @@
1
+ # Contributing
2
+
3
+ ## Development setup
4
+
5
+ ```bash
6
+ python -m venv .venv
7
+ source .venv/bin/activate
8
+ python -m pip install -U pip
9
+ python -m pip install -e ".[dev]"
10
+ ```
11
+
12
+ ## Running tests
13
+
14
+ ```bash
15
+ # Unit tests (default)
16
+ pytest -q
17
+
18
+ # With verbose output
19
+ pytest -v
20
+
21
+ # Integration tests (requires Docker ArangoDB on port 18529)
22
+ docker compose up -d
23
+ export RUN_INTEGRATION=1
24
+ export ARANGO_URL=http://localhost:18529
25
+ export ARANGO_DB=schema_analyzer_it
26
+ export ARANGO_USER=root
27
+ export ARANGO_PASS=openSesame
28
+ pytest -q -m integration
29
+ ```
30
+
31
+ ## Linting and formatting
32
+
33
+ ```bash
34
+ ruff check . # lint
35
+ ruff format --check . # format check
36
+ ruff format . # auto-format
37
+ mypy schema_analyzer/ # type checking
38
+ ```
39
+
40
+ ## Project structure
41
+
42
+ ```
43
+ schema_analyzer/
44
+ ├── analyzer.py # AgenticSchemaAnalyzer (main entry point)
45
+ ├── baseline.py # Deterministic baseline inference (fallback when no LLM is configured)
46
+ ├── cache.py # Filesystem caching by schema fingerprint
47
+ ├── cli.py # CLI: tool mode + eval subcommand
48
+ ├── conceptual.py # ConceptualSchema dataclass
49
+ ├── defaults.py # Centralized tunable constants
50
+ ├── docs.py # Markdown documentation generator
51
+ ├── errors.py # SchemaAnalyzerError
52
+ ├── exports.py # Transpiler export (Cypher)
53
+ ├── mapping.py # PhysicalMapping with AQL helpers
54
+ ├── owl_export.py # OWL Turtle export
55
+ ├── snapshot.py # Physical schema introspection
56
+ ├── tool.py # Tool contract v1 entrypoint
57
+ ├── tool_contract_v1.py # JSON Schema validation
58
+ ├── types.py # Pydantic models (AnalysisMetadata, AnalysisResult)
59
+ ├── utils.py # Shared utilities (pascal_case, sha256, JSON extraction)
60
+ ├── validation.py # LLM output validation schema
61
+ ├── workflow.py # Generate → validate → repair loop
62
+ ├── eval/ # Evaluation harness
63
+ │ ├── domain_loader.py # Load domain specs from domains/
64
+ │ ├── generator.py # Physical schema generator (PG + LPG variants)
65
+ │ ├── runner.py # Eval orchestration and reporting
66
+ │ └── scoring.py # F1, domain/range, mapping style scoring
67
+ ├── providers/ # LLM provider implementations
68
+ │ ├── base.py # LLMProvider protocol
69
+ │ ├── openai_provider.py
70
+ │ ├── anthropic_provider.py
71
+ │ └── openrouter_provider.py
72
+ └── tool_contract/v1/ # Bundled JSON Schema files
73
+ ```
74
+
75
+ ## Guidelines
76
+
77
+ - Keep outputs deterministic (ordering, stable JSON).
78
+ - Do not log or persist secrets (API keys, credentials).
79
+ - Add/adjust tests for behavior changes (golden fixtures where appropriate).
80
+ - Use `defaults.py` for tunable constants — avoid scattering magic numbers.
81
+ - Use `pascal_case()` from `utils.py` — do not create local copies.
82
+ - Shared test helpers live in `tests/conftest.py` — prefer importing over duplicating.
83
+ - Provider implementations should use `raise ... from e` for proper exception chaining.
84
+ - All tool responses (including unexpected errors) must be contract-shaped JSON.
85
+
86
+ ## Adding a new LLM provider
87
+
88
+ 1. Create `schema_analyzer/providers/my_provider.py` implementing the `LLMProvider` protocol
89
+ 2. Register in `schema_analyzer/providers/__init__.py` `_REGISTRY`
90
+ 3. Use constants from `defaults.py` for temperature, max_tokens, etc.
91
+ 4. Wrap SDK errors as `SchemaAnalyzerError(code="PROVIDER_ERROR")` with `raise ... from e`
92
+
93
+ ## Adding a new domain pack
94
+
95
+ 1. Create `domains/<name>/domain.json` with entities and relationships
96
+ 2. The eval harness auto-discovers domains via `list_domains()`
97
+ 3. Run `arangodb-schema-analyzer eval --domains <name>` to test
@@ -0,0 +1,174 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+
135
+ 6. Trademarks. This License does not grant permission to use the trade
136
+ names, trademarks, service marks, or product names of the Licensor,
137
+ except as required for reasonable and customary use in describing the
138
+ origin of the Work and reproducing the content of the NOTICE file.
139
+
140
+ 7. Disclaimer of Warranty. Unless required by applicable law or
141
+ agreed to in writing, Licensor provides the Work (and each
142
+ Contributor provides its Contributions) on an "AS IS" BASIS,
143
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
144
+ implied, including, without limitation, any warranties or conditions
145
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
146
+ PARTICULAR PURPOSE. You are solely responsible for determining the
147
+ appropriateness of using or redistributing the Work and assume any
148
+ risks associated with Your exercise of permissions under this License.
149
+
150
+ 8. Limitation of Liability. In no event and under no legal theory,
151
+ whether in tort (including negligence), contract, or otherwise,
152
+ unless required by applicable law (such as deliberate and grossly
153
+ negligent acts) or agreed to in writing, shall any Contributor be
154
+ liable to You for damages, including any direct, indirect, special,
155
+ incidental, or consequential damages of any character arising as a
156
+ result of this License or out of the use or inability to use the
157
+ Work (including but not limited to damages for loss of goodwill,
158
+ work stoppage, computer failure or malfunction, or any and all
159
+ other commercial damages or losses), even if such Contributor
160
+ has been advised of the possibility of such damages.
161
+
162
+ 9. Accepting Warranty or Additional Liability. While redistributing
163
+ the Work or Derivative Works thereof, You may choose to offer,
164
+ and charge a fee for, acceptance of support, warranty, indemnity,
165
+ or other liability obligations and/or rights consistent with this
166
+ License. However, in accepting such obligations, You may act only
167
+ on Your own behalf and on Your sole responsibility, not on behalf
168
+ of any other Contributor, and only if You agree to indemnify,
169
+ defend, and hold each Contributor harmless for any liability
170
+ incurred by, or claims asserted against, such Contributor by reason
171
+ of your accepting any such warranty or additional liability.
172
+
173
+ END OF TERMS AND CONDITIONS
174
+
@@ -0,0 +1,198 @@
1
+ Metadata-Version: 2.4
2
+ Name: arangodb-schema-analyzer
3
+ Version: 0.3.0
4
+ Summary: Agentic schema analyzer for ArangoDB: conceptual model + conceptual-to-physical mapping for transpilers.
5
+ Project-URL: Homepage, https://github.com/ArthurKeen/arango-schema-mapper
6
+ Project-URL: Repository, https://github.com/ArthurKeen/arango-schema-mapper
7
+ Project-URL: Issues, https://github.com/ArthurKeen/arango-schema-mapper/issues
8
+ Project-URL: Changelog, https://github.com/ArthurKeen/arango-schema-mapper/blob/main/CHANGELOG.md
9
+ Author: Arthur Keen
10
+ Maintainer: Arthur Keen
11
+ License-Expression: Apache-2.0
12
+ License-File: LICENSE
13
+ Keywords: arangodb,conceptual-model,cypher,graph,llm,mcp,ontology,owl,schema,sparql,transpiler
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Intended Audience :: Information Technology
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3 :: Only
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3.13
25
+ Classifier: Topic :: Database
26
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
27
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
+ Classifier: Typing :: Typed
29
+ Requires-Python: >=3.10
30
+ Requires-Dist: jsonschema>=4.21.0
31
+ Requires-Dist: pydantic>=2.6.0
32
+ Requires-Dist: python-arango>=8.1.1
33
+ Provides-Extra: anthropic
34
+ Requires-Dist: anthropic>=0.25.0; extra == 'anthropic'
35
+ Provides-Extra: dev
36
+ Requires-Dist: mypy>=1.10.0; extra == 'dev'
37
+ Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
38
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
39
+ Requires-Dist: ruff>=0.4.0; extra == 'dev'
40
+ Provides-Extra: mcp
41
+ Requires-Dist: mcp>=1.2.0; extra == 'mcp'
42
+ Provides-Extra: openai
43
+ Requires-Dist: openai>=1.0.0; extra == 'openai'
44
+ Provides-Extra: openrouter
45
+ Description-Content-Type: text/markdown
46
+
47
+ # arangodb-schema-analyzer (v0.3)
48
+
49
+ Standalone Python library that analyzes an ArangoDB database's physical schema and produces:
50
+
51
+ - a **conceptual schema** (entities, relationships, properties)
52
+ - a **conceptual→physical mapping** suitable for transpilers (Cypher, SPARQL, future)
53
+ - **metadata** (confidence, timestamp, analyzed collection counts, detected patterns)
54
+
55
+ ## Install
56
+
57
+ From source (this repo):
58
+
59
+ ```bash
60
+ python -m pip install -e .
61
+ ```
62
+
63
+ Optional LLM provider extras:
64
+
65
+ ```bash
66
+ python -m pip install -e ".[openai]"
67
+ python -m pip install -e ".[anthropic]"
68
+ ```
69
+
70
+ OpenRouter is also supported and requires no extra SDK (uses stdlib `urllib`).
71
+
72
+ **MCP (Model Context Protocol)** — optional stdio server wrapping the v1 JSON tool contract:
73
+
74
+ ```bash
75
+ python -m pip install -e ".[mcp]"
76
+ arangodb-schema-analyzer-mcp
77
+ ```
78
+
79
+ If you don't install a provider SDK (or you don't provide an API key), analysis degrades gracefully to deterministic baseline inference.
80
+
81
+ ## Usage
82
+
83
+ ```python
84
+ from arango import ArangoClient
85
+
86
+ from schema_analyzer import AgenticSchemaAnalyzer
87
+
88
+ client = ArangoClient(hosts="http://localhost:8529")
89
+ db = client.db("mydb", username="root", password="openSesame")
90
+
91
+ analyzer = AgenticSchemaAnalyzer(
92
+ llm_provider="openai", # or "anthropic" or "openrouter"
93
+ api_key=None, # e.g. os.environ["OPENAI_API_KEY"]
94
+ model="gpt-4o-mini",
95
+ cache={"type": "filesystem", "directory": ".schema-analyzer-cache"},
96
+ )
97
+
98
+ analysis = analyzer.analyze_physical_schema(
99
+ db,
100
+ timeout_ms=60_000,
101
+ sample_limit_per_collection=5,
102
+ )
103
+
104
+ print(analysis.metadata.confidence)
105
+ ```
106
+
107
+ ## Tool usage (CLI)
108
+
109
+ This project can be called as a **non-interactive tool** (stdin JSON → stdout JSON) using the v1 contract under `docs/tool-contract/v1/`.
110
+
111
+ Install (editable):
112
+
113
+ ```bash
114
+ python -m pip install -e .
115
+ ```
116
+
117
+ Example (analyze) using the provided request example:
118
+
119
+ ```bash
120
+ cat docs/tool-contract/v1/examples/request.analyze.json | arangodb-schema-analyzer --pretty
121
+ ```
122
+
123
+ ### CLI options
124
+
125
+ ```
126
+ arangodb-schema-analyzer [--request FILE] [--out FILE] [--pretty] [-v]
127
+ ```
128
+
129
+ - `--request FILE` — path to request JSON (default: read from stdin)
130
+ - `--out FILE` — write response JSON to file (default: stdout)
131
+ - `--pretty` — pretty-print JSON output
132
+ - `-v` — enable verbose logging
133
+
134
+ ## Evaluation CLI
135
+
136
+ Run analysis quality benchmarks against domain packs:
137
+
138
+ ```bash
139
+ arangodb-schema-analyzer eval \
140
+ --provider openai \
141
+ --model gpt-4o-mini \
142
+ --report eval_report.json \
143
+ --baseline eval_baseline.json
144
+ ```
145
+
146
+ Options: `--url`, `--user`, `--password`, `--database`, `--domains`, `--sample-limit`, `--timeout-ms`, `--scale`, `--no-cleanup`.
147
+
148
+ Domains included: `healthcare`, `financial_fraud_detection`, `insurance`, `intelligence`, `network_asset_management`.
149
+
150
+ ## Public API
151
+
152
+ Exports:
153
+
154
+ - `AgenticSchemaAnalyzer` — main analyzer class
155
+ - `ConceptualSchema` — conceptual schema dataclass
156
+ - `PhysicalMapping` — physical mapping dataclass with AQL helpers
157
+ - `generate_schema_docs(analysis)` — Markdown documentation generator
158
+ - `export_mapping(analysis, target)` — transpiler export (v0.1: `cypher`)
159
+ - `export_conceptual_model_as_owl_turtle(analysis)` — OWL Turtle export
160
+ - `register_provider(name, ...)` — register custom LLM providers
161
+ - `list_providers()` — list registered LLM provider names
162
+
163
+ ## Configuration
164
+
165
+ Tunable defaults live in `schema_analyzer/defaults.py`. Key parameters:
166
+
167
+ | Parameter | Default | Description |
168
+ |---|---|---|
169
+ | `MAX_REPAIR_ATTEMPTS` | 2 | LLM repair loop iterations |
170
+ | `LLM_TEMPERATURE` | 0.0 | Sampling temperature |
171
+ | `DEFAULT_TIMEOUT_MS` | 60000 | Analysis timeout (ms) |
172
+ | `DEFAULT_REVIEW_THRESHOLD` | 0.6 | Confidence threshold for `review_required` |
173
+ | `DEFAULT_CACHE_TTL_SECONDS` | 86400 | Cache TTL (seconds) |
174
+
175
+ ## Notes
176
+
177
+ - **Secrets**: API keys are read from config/env; never persisted by this library.
178
+ - **AQL fragments**: helper methods return AQL text + bind variables; collection names are passed via bind parameters.
179
+ - **Graceful degradation**: without an LLM provider, the analyzer returns deterministic baseline inference with `review_required=True`.
180
+
181
+ ## Integration evaluation (Docker ArangoDB)
182
+
183
+ Bring up a local ArangoDB:
184
+
185
+ ```bash
186
+ docker compose up -d
187
+ ```
188
+
189
+ Run integration tests (opt-in):
190
+
191
+ ```bash
192
+ export RUN_INTEGRATION=1
193
+ export ARANGO_URL=http://localhost:18529
194
+ export ARANGO_DB=schema_analyzer_it
195
+ export ARANGO_USER=root
196
+ export ARANGO_PASS=openSesame
197
+ pytest -q -m integration
198
+ ```