tkm-graphforge 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. tkm_graphforge-0.1.0/.github/FUNDING.yml +1 -0
  2. tkm_graphforge-0.1.0/.github/workflows/ci.yml +43 -0
  3. tkm_graphforge-0.1.0/.pre-commit-config.yaml +18 -0
  4. tkm_graphforge-0.1.0/AGENTS.md +83 -0
  5. tkm_graphforge-0.1.0/CHANGELOG.md +80 -0
  6. tkm_graphforge-0.1.0/CLAUDE.md +2 -0
  7. tkm_graphforge-0.1.0/CONTRIBUTING.md +13 -0
  8. tkm_graphforge-0.1.0/EVOLUTION.md +157 -0
  9. tkm_graphforge-0.1.0/LICENSE +21 -0
  10. tkm_graphforge-0.1.0/Makefile +45 -0
  11. tkm_graphforge-0.1.0/PKG-INFO +147 -0
  12. tkm_graphforge-0.1.0/README.md +123 -0
  13. tkm_graphforge-0.1.0/domains/science.yaml +51 -0
  14. tkm_graphforge-0.1.0/domains/social.yaml +43 -0
  15. tkm_graphforge-0.1.0/domains/technology.yaml +51 -0
  16. tkm_graphforge-0.1.0/examples/01_technology_graph.py +95 -0
  17. tkm_graphforge-0.1.0/examples/02_text_extraction.py +79 -0
  18. tkm_graphforge-0.1.0/examples/03_async_parallel_enrichment.py +84 -0
  19. tkm_graphforge-0.1.0/graphforge/__init__.py +9 -0
  20. tkm_graphforge-0.1.0/graphforge/async_builder.py +173 -0
  21. tkm_graphforge-0.1.0/graphforge/builder.py +193 -0
  22. tkm_graphforge-0.1.0/graphforge/domains.py +106 -0
  23. tkm_graphforge-0.1.0/graphforge/enricher.py +134 -0
  24. tkm_graphforge-0.1.0/graphforge/extractor.py +206 -0
  25. tkm_graphforge-0.1.0/graphforge/models.py +45 -0
  26. tkm_graphforge-0.1.0/pyproject.toml +54 -0
  27. tkm_graphforge-0.1.0/tests/__init__.py +0 -0
  28. tkm_graphforge-0.1.0/tests/conftest.py +110 -0
  29. tkm_graphforge-0.1.0/tests/test_builder.py +81 -0
  30. tkm_graphforge-0.1.0/tests/test_builder_extended.py +90 -0
  31. tkm_graphforge-0.1.0/tests/test_domains.py +65 -0
  32. tkm_graphforge-0.1.0/tests/test_domains_extended.py +83 -0
  33. tkm_graphforge-0.1.0/tests/test_enricher.py +84 -0
  34. tkm_graphforge-0.1.0/tests/test_enricher_extended.py +99 -0
  35. tkm_graphforge-0.1.0/tests/test_error_hardening.py +171 -0
  36. tkm_graphforge-0.1.0/tests/test_extractor.py +77 -0
  37. tkm_graphforge-0.1.0/tests/test_extractor_extended.py +110 -0
  38. tkm_graphforge-0.1.0/tests/test_models.py +50 -0
  39. tkm_graphforge-0.1.0/tests/test_models_extended.py +73 -0
  40. tkm_graphforge-0.1.0/tests/test_performance.py +213 -0
  41. tkm_graphforge-0.1.0/tests/test_property_based.py +232 -0
  42. tkm_graphforge-0.1.0/tests/test_security.py +142 -0
@@ -0,0 +1 @@
1
+ github: TECHKNOWMAD-LABS
@@ -0,0 +1,43 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: ["main", "develop", "feature/**"]
6
+ pull_request:
7
+ branches: ["main"]
8
+
9
+ jobs:
10
+ test:
11
+ name: Lint + Test (Python ${{ matrix.python-version }})
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ fail-fast: false
15
+ matrix:
16
+ python-version: ["3.12"]
17
+
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+
21
+ - name: Set up Python ${{ matrix.python-version }}
22
+ uses: actions/setup-python@v5
23
+ with:
24
+ python-version: ${{ matrix.python-version }}
25
+
26
+ - name: Install uv
27
+ run: pip install uv
28
+
29
+ - name: Install dependencies
30
+ run: uv sync --all-extras
31
+
32
+ - name: Lint with ruff
33
+ run: uv run ruff check graphforge/ tests/
34
+
35
+ - name: Run tests with coverage
36
+ run: uv run pytest -v --tb=short --cov=graphforge --cov-report=term-missing --cov-fail-under=95
37
+
38
+ - name: Upload coverage report
39
+ if: always()
40
+ uses: actions/upload-artifact@v4
41
+ with:
42
+ name: coverage-report-${{ matrix.python-version }}
43
+ path: .coverage
@@ -0,0 +1,18 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.4.4
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+ - id: ruff-format
8
+
9
+ - repo: https://github.com/pre-commit/pre-commit-hooks
10
+ rev: v4.6.0
11
+ hooks:
12
+ - id: trailing-whitespace
13
+ - id: end-of-file-fixer
14
+ - id: check-yaml
15
+ - id: check-added-large-files
16
+ args: ["--maxkb=500"]
17
+ - id: detect-private-key
18
+ - id: check-merge-conflict
@@ -0,0 +1,83 @@
1
+ # AGENTS.md — Edgecraft Autonomous Development Protocol
2
+
3
+ This repository was developed using the **Edgecraft Protocol**, an autonomous
4
+ multi-cycle development system that iterates through structured improvement layers.
5
+
6
+ ## Protocol Overview
7
+
8
+ Edgecraft operates through 8 prescribed cycles, each targeting a specific quality
9
+ dimension. All cycles execute without human intervention; the agent fixes failures
10
+ and continues.
11
+
12
+ ### Cycle Structure
13
+
14
+ | Cycle | Layer | Focus | Commit Prefix |
15
+ |-------|-------|-------|---------------|
16
+ | 1 | L1/detection → L5/action → L6/grounding | Test coverage | `L1/detection:`, `L5/action:`, `L6/grounding:` |
17
+ | 2 | L3/sub-noise → L5/action | Error hardening | `L3/sub-noise:`, `L5/action:` |
18
+ | 3 | L4/conjecture → L6/grounding → L7/flywheel | Performance | `L4/conjecture:`, `L6/grounding:`, `L7/flywheel:` |
19
+ | 4 | L2/noise → L5/action | Security | `L2/noise:`, `L5/action:` |
20
+ | 5 | L5/action | CI/CD | `L5/action:` |
21
+ | 6 | L3/sub-noise → L6/grounding | Property-based testing | `L3/sub-noise:`, `L6/grounding:` |
22
+ | 7 | L5/action | Examples + documentation | `L5/action:` |
23
+ | 8 | L5/action | Release engineering | `L5/action:` |
24
+
25
+ ### Layer Semantics
26
+
27
+ - **L1/detection** — Identify what is missing or broken.
28
+ - **L2/noise** — Surface-level scan results (security, lint).
29
+ - **L3/sub-noise** — Subtle bugs, edge cases, property violations.
30
+ - **L4/conjecture** — Hypothesis about improvement potential.
31
+ - **L5/action** — Concrete implementation of the fix/feature.
32
+ - **L6/grounding** — Measured validation of the hypothesis.
33
+ - **L7/flywheel** — Pattern recognition for cross-repo applicability.
34
+
35
+ ## What the Agent Does in Each Cycle
36
+
37
+ ### Cycle 1 — Test Coverage
38
+ 1. Run `pytest --cov` to find uncovered lines.
39
+ 2. Write `conftest.py` with shared fixtures.
40
+ 3. Write test files targeting every uncovered branch.
41
+ 4. Fix any failing tests before committing.
42
+
43
+ ### Cycle 2 — Error Hardening
44
+ 1. Attempt to break the code with: `None`, empty strings, wrong types,
45
+ malformed data, huge inputs, unicode.
46
+ 2. Add input validation, graceful fallbacks, and type guards.
47
+
48
+ ### Cycle 3 — Performance
49
+ 1. Find sequential I/O-bound operations.
50
+ 2. Parallelise with `asyncio.gather` + semaphore.
51
+ 3. Measure and log actual speedup.
52
+
53
+ ### Cycle 4 — Security
54
+ 1. Scan for hardcoded secrets using 7+ patterns.
55
+ 2. Check for injection vectors (path traversal, SQL, command).
56
+ 3. Fix all real findings.
57
+
58
+ ### Cycle 5 — CI/CD
59
+ 1. Create GitHub Actions workflow with lint + test.
60
+ 2. Create `.pre-commit-config.yaml` with ruff + hooks.
61
+
62
+ ### Cycle 6 — Property-Based Testing
63
+ 1. Write Hypothesis tests for core invariants.
64
+ 2. If Hypothesis finds failures, fix the underlying code first.
65
+
66
+ ### Cycle 7 — Examples + Docs
67
+ 1. Create 2-3 working example scripts in `examples/`.
68
+ 2. Test each example manually.
69
+ 3. Add docstrings to all public functions.
70
+
71
+ ### Cycle 8 — Release Engineering
72
+ 1. Finalise `pyproject.toml` metadata.
73
+ 2. Write `CHANGELOG.md`.
74
+ 3. Create `Makefile`, `AGENTS.md`, `EVOLUTION.md`.
75
+ 4. Tag `v0.1.0`.
76
+
77
+ ## Absolute Rules
78
+
79
+ - Never ask questions. Never pause.
80
+ - Fix all test failures before committing.
81
+ - Every commit must have a meaningful diff.
82
+ - Push after each cycle.
83
+ - All commit messages start with the Edgecraft layer prefix.
@@ -0,0 +1,80 @@
1
+ # Changelog
2
+
3
+ All notable changes to GraphForge are documented in this file.
4
+
5
+ ## [0.1.0] — 2026-03-23
6
+
7
+ ### Summary
8
+ First release following 8 autonomous Edgecraft iteration cycles.
9
+
10
+ ### Cycle 1 — Test Coverage
11
+ - Added `tests/conftest.py` with shared fixtures (`make_entity`, `make_relationship`,
12
+ `triangle_builder`, `tech_extractor`).
13
+ - Added extended test files for all 5 source modules (builder, enricher, extractor,
14
+ domains, models), targeting previously uncovered branches.
15
+ - Fixed `ModuleNotFoundError` for `numpy` and `scipy` (added to project dependencies).
16
+ - Coverage improved from 89% to **100%** across all 6 source modules.
17
+ - Total tests: 92 passing.
18
+
19
+ ### Cycle 2 — Error Hardening
20
+ - `GraphExtractor.from_dict`: `None` returns `([], [])`, `TypeError` for non-list,
21
+ `ValueError` for >100k records, non-dict entries silently skipped.
22
+ - `GraphExtractor.from_text`: `None`/empty/whitespace returns `([], [])`, `bytes` are
23
+ decoded to UTF-8, `ValueError` guard for >1M chars.
24
+ - `GraphExtractor.validate`: `None` inputs treated as empty lists.
25
+ - `GraphBuilder.add_entity/add_relationship`: `TypeError` on wrong type.
26
+ - `GraphBuilder.add_entities/add_relationships`: `None` is a no-op.
27
+ - `GraphBuilder.get_node/get_neighbors/get_predecessors/find_by_type`: all `None`-safe.
28
+ - `GraphEnricher.add_node_property`: `None` node_id returns `False`; empty key raises.
29
+ - `GraphEnricher.bulk_enrich_nodes`: `None`/empty dict returns `[]` early.
30
+ - Tests: 27 new hardening tests. Total: 119 passing.
31
+
32
+ ### Cycle 3 — Performance
33
+ - Added `graphforge/async_builder.py` with:
34
+ - `enrich_nodes_parallel`: `asyncio.gather` with semaphore for parallel node enrichment.
35
+ - `build_graph_parallel`: parallel record-batch parsing via `run_in_executor`.
36
+ - `measure_sequential_vs_parallel`: benchmark utility.
37
+ - `GraphBuilder._version`: cache-invalidation counter bumped on every structural change.
38
+ - Measured **30.4x speedup** for I/O-bound enrichment (30 nodes, 10ms each).
39
+ - Tests: 13 new performance tests. Total: 132 passing.
40
+
41
+ ### Cycle 4 — Security
42
+ - Security scan: 0 real findings across 6 source files, 7 secret patterns.
43
+ - Fixed **CWE-22 path traversal** in `DomainLoader.load`: domain name validated
44
+ against `/`, `\\`, `..`; `Path.resolve()` + `relative_to()` ensures path stays
45
+ within domains directory.
46
+ - Tests: 11 new security tests. Total: 143 passing.
47
+
48
+ ### Cycle 5 — CI/CD
49
+ - Added `.github/workflows/ci.yml`: checkout, Python 3.12, uv sync, ruff check,
50
+ pytest with `--cov-fail-under=95`, coverage artifact upload.
51
+ - Added `.pre-commit-config.yaml`: ruff + ruff-format, trailing-whitespace,
52
+ end-of-file-fixer, check-yaml, detect-private-key, check-merge-conflict.
53
+ - Applied `ruff --fix` to all source and test files (22 auto-fixed lint issues).
54
+
55
+ ### Cycle 6 — Property-Based Testing
56
+ - Added `tests/test_property_based.py` with 11 Hypothesis property tests:
57
+ 1. Serialisation round-trips preserve node/edge counts.
58
+ 2. Entity/Relationship construction stable on any valid strings.
59
+ 3. `validate()` never crashes on any entity/rel combination.
60
+ 4. `from_dict()` output counts bounded by input record count.
61
+ 5. `build()` node count equals unique entity count.
62
+ 6. `from_text()` never crashes on any string ≤500 chars.
63
+ 7. Entity hash/equality contract holds for all distinct IDs.
64
+ - Hypothesis found **no failures** across all strategies.
65
+
66
+ ### Cycle 7 — Examples + Docs
67
+ - Added `examples/01_technology_graph.py`: full workflow example.
68
+ - Added `examples/02_text_extraction.py`: regex NER + graph enrichment.
69
+ - Added `examples/03_async_parallel_enrichment.py`: parallel batch + async enrichment.
70
+ - Fixed bug: `weight` key now excluded from `Relationship.properties` in `from_dict`
71
+ (prevented `TypeError: multiple values for keyword argument 'weight'`).
72
+ - Added missing docstrings to `GraphBuilder` public methods.
73
+
74
+ ### Cycle 8 — Release Engineering
75
+ - Updated `pyproject.toml`: author, readme, keywords, classifiers.
76
+ - Added `CHANGELOG.md` (this file).
77
+ - Added `Makefile` with `test`, `lint`, `format`, `security`, `clean` targets.
78
+ - Added `AGENTS.md` documenting the Edgecraft autonomous development protocol.
79
+ - Added `EVOLUTION.md` with per-cycle timestamps and findings.
80
+ - Tagged `v0.1.0`.
@@ -0,0 +1,2 @@
1
+ - Production quality. Run tests. Fix failures.
2
+ - Python 3.12 Node 22. Type hints. Env vars for secrets.
@@ -0,0 +1,13 @@
1
+ # Contributing to this project
2
+
3
+ 1. Fork this repository
4
+ 2. Create a feature branch (`git checkout -b feat/your-feature`)
5
+ 3. Write tests for your changes
6
+ 4. Ensure all tests pass (`pytest -v`)
7
+ 5. Ensure linter passes (`ruff check .` for Python)
8
+ 6. Commit with a descriptive message
9
+ 7. Open a Pull Request
10
+
11
+ By contributing, you agree that your contributions will be licensed under the MIT License.
12
+
13
+ Built by [TechKnowMad Labs](https://techknowmad.ai)
@@ -0,0 +1,157 @@
1
+ # EVOLUTION.md — Edgecraft 8-Cycle Autonomous Development Log
2
+
3
+ Repository: `TECHKNOWMAD-LABS/graph-forge`
4
+ Protocol: Edgecraft v4.0
5
+ Date: 2026-03-23
6
+ Agent: Claude Sonnet 4.6
7
+
8
+ ---
9
+
10
+ ## Cycle 1 — Test Coverage
11
+ **Timestamp**: 2026-03-23T00:00
12
+
13
+ ### Findings
14
+ - `graphforge/builder.py` lines 85, 89-91, 110, 116 at 0% coverage.
15
+ - `graphforge/enricher.py` lines 65, 89-92, 98-105, 119 at 0%.
16
+ - `graphforge/models.py` lines 21, 40 (`NotImplemented` branches) at 0%.
17
+ - Missing `numpy` + `scipy` deps caused `test_enrich_pagerank` to fail.
18
+
19
+ ### Actions
20
+ - Added `tests/conftest.py` with `make_entity`, `make_relationship` factories
21
+ and 5 shared fixtures.
22
+ - Added 5 extended test files (50 new tests) covering every previously uncovered branch.
23
+ - Added `numpy>=2.4.3` and `scipy>=1.17.1` to project dependencies.
24
+
25
+ ### Result
26
+ - **92 tests passing** | **100% coverage** across all 6 source modules.
27
+
28
+ ---
29
+
30
+ ## Cycle 2 — Error Hardening
31
+ **Timestamp**: 2026-03-23T00:15
32
+
33
+ ### Findings
34
+ - `from_dict(None)` → `AttributeError` on `.items()`.
35
+ - `from_text(b"bytes")` → `TypeError` (bytes not str).
36
+ - `add_entity("string")` → `AttributeError` on `.id`.
37
+ - `bulk_enrich_nodes({})` iterated unnecessarily.
38
+ - `find_by_type("")` matched nodes with empty `entity_type` attr.
39
+ - `from_dict([None, 42, "str"])` → `AttributeError` on `.get()`.
40
+
41
+ ### Actions
42
+ - Added `None`/type guards to `from_dict`, `from_text`, `validate`.
43
+ - Added `TypeError` guards to `add_entity`, `add_relationship`.
44
+ - Added `None`-safe returns to `get_node`, `get_neighbors`, `get_predecessors`.
45
+ - Added early-exit for empty inputs in `bulk_enrich_nodes`.
46
+ - Bytes auto-decoded to UTF-8 in `from_text`.
47
+
48
+ ### Result
49
+ - **119 tests passing** | All hardening tests pass.
50
+
51
+ ---
52
+
53
+ ## Cycle 3 — Performance
54
+ **Timestamp**: 2026-03-23T00:30
55
+
56
+ ### Conjecture
57
+ Parallelising N I/O-bound node enrichment calls will yield ~Nx speedup.
58
+
59
+ ### Actions
60
+ - Added `graphforge/async_builder.py`:
61
+ - `enrich_nodes_parallel(builder, enricher_fn, *, concurrency=16)`
62
+ - `build_graph_parallel(record_batches, *, concurrency=16)`
63
+ - `measure_sequential_vs_parallel()` benchmark utility.
64
+ - Added `_version` counter to `GraphBuilder` for cache-invalidation support.
65
+
66
+ ### Result (measured on test machine)
67
+ - **Sequential**: 0.331s (30 nodes × 10ms I/O)
68
+ - **Parallel**: 0.011s (concurrency=30)
69
+ - **Speedup**: **30.4x**
70
+ - Pattern applicable to: `tkm-enhance`, `cortex-research-suite` enrichment pipelines.
71
+
72
+ ---
73
+
74
+ ## Cycle 4 — Security
75
+ **Timestamp**: 2026-03-23T00:45
76
+
77
+ ### Scan Results
78
+ - Files scanned: 6 Python source files.
79
+ - Patterns checked: AWS AKIA, GitHub PATs (ghp_, ghs_), OpenAI (sk-), SSH/RSA private keys,
80
+ generic `password=` assignments.
81
+ - **Real findings: 0**
82
+ - False positives: 1 (`_MAX_TEXT_LENGTH = 1_000_000` matched a broad numeric pattern).
83
+
84
+ ### Actions
85
+ - Fixed **CWE-22 path traversal** in `DomainLoader.load`:
86
+ - Reject domain names containing `/`, `\\`, or `..`.
87
+ - `Path.resolve()` + `relative_to()` verifies path stays within `domains_dir`.
88
+
89
+ ---
90
+
91
+ ## Cycle 5 — CI/CD
92
+ **Timestamp**: 2026-03-23T01:00
93
+
94
+ ### Actions
95
+ - `.github/workflows/ci.yml`: Python 3.12, uv, ruff check, pytest 95% coverage gate.
96
+ - `.pre-commit-config.yaml`: ruff + ruff-format, trailing-whitespace, check-yaml,
97
+ detect-private-key, check-merge-conflict.
98
+ - Applied `ruff --fix`: 22 auto-fixed issues (unused imports, f-string prefix, import order).
99
+
100
+ ---
101
+
102
+ ## Cycle 6 — Property-Based Testing
103
+ **Timestamp**: 2026-03-23T01:15
104
+
105
+ ### Invariants Tested
106
+ 1. `to_dict()` → `from_dict()` preserves node and edge counts.
107
+ 2. `Entity`/`Relationship` construction stable for all valid strings.
108
+ 3. `validate()` never raises for any entity/rel combination.
109
+ 4. `from_dict()` output bounded by input record count.
110
+ 5. `build()` node_count equals unique entity count.
111
+ 6. `from_text()` never crashes for any string ≤500 chars.
112
+ 7. Entity hash/equality contract for all distinct IDs.
113
+
114
+ ### Hypothesis Results
115
+ - **No failures found** across 11 property tests and 7 strategies.
116
+ - Total Hypothesis examples run: ~870.
117
+
118
+ ---
119
+
120
+ ## Cycle 7 — Examples + Docs
121
+ **Timestamp**: 2026-03-23T01:30
122
+
123
+ ### Actions
124
+ - `examples/01_technology_graph.py` — domain load → extract → build → query → serialise.
125
+ - `examples/02_text_extraction.py` — regex NER, PageRank, community detection.
126
+ - `examples/03_async_parallel_enrichment.py` — parallel batch + async enrichment (17x speedup).
127
+ - **Bug fixed**: `weight` key excluded from `Relationship.properties` in `from_dict`
128
+ (prevented `TypeError: multiple values for keyword argument 'weight'`).
129
+ - Added docstrings to all undocumented public methods in `GraphBuilder`.
130
+
131
+ ---
132
+
133
+ ## Cycle 8 — Release Engineering
134
+ **Timestamp**: 2026-03-23T01:45
135
+
136
+ ### Actions
137
+ - Updated `pyproject.toml`: author, readme, keywords, PyPI classifiers.
138
+ - Created `CHANGELOG.md` with all cycle improvements.
139
+ - Created `Makefile` with `test`, `lint`, `format`, `security`, `clean` targets.
140
+ - Created `AGENTS.md` documenting the Edgecraft protocol.
141
+ - Created `EVOLUTION.md` (this file).
142
+ - Tagged `v0.1.0`.
143
+
144
+ ---
145
+
146
+ ## Final State
147
+
148
+ | Metric | Value |
149
+ |--------|-------|
150
+ | Total tests | 154 |
151
+ | Coverage | 100% |
152
+ | Cycles completed | 8 |
153
+ | Security findings | 0 |
154
+ | Property strategies | 7 |
155
+ | Examples | 3 |
156
+ | Max speedup measured | 30.4x |
157
+ | Commits | ~16 |
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 TechKnowMad Labs Private Limited
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,45 @@
1
+ .PHONY: test lint format security clean install help
2
+
3
+ # Default target
4
+ help:
5
+ @echo "GraphForge — available make targets:"
6
+ @echo " make install Install all dependencies (uv sync)"
7
+ @echo " make test Run full test suite with coverage"
8
+ @echo " make lint Run ruff linter"
9
+ @echo " make format Run ruff formatter"
10
+ @echo " make security Run secret scan on source files"
11
+ @echo " make clean Remove build artifacts and caches"
12
+
13
+ install:
14
+ uv sync --all-extras
15
+
16
+ test:
17
+ uv run pytest -v --tb=short --cov=graphforge --cov-report=term-missing --cov-fail-under=95
18
+
19
+ test-fast:
20
+ uv run pytest -q --tb=line
21
+
22
+ test-property:
23
+ uv run pytest tests/test_property_based.py -v --tb=short
24
+
25
+ lint:
26
+ uv run ruff check graphforge/ tests/
27
+
28
+ format:
29
+ uv run ruff format graphforge/ tests/
30
+ uv run ruff check --fix graphforge/ tests/
31
+
32
+ security:
33
+ @echo "Running secret scan..."
34
+ @python3 -c "\
35
+ import re, pathlib; \
36
+ patterns = [r'AKIA[0-9A-Z]{16}', r'ghp_[A-Za-z0-9]{36}', r'sk-[A-Za-z0-9]{20,}', r'-----BEGIN.*PRIVATE KEY']; \
37
+ files = list(pathlib.Path('graphforge').rglob('*.py')); \
38
+ findings = []; \
39
+ [findings.extend([f'{f}:{i+1}' for i,l in enumerate(f.read_text().splitlines()) if any(re.search(p,l) for p in patterns)]) for f in files]; \
40
+ print(f'Scanned {len(files)} files — {len(findings)} findings') if not findings else print('FINDINGS:', findings)"
41
+
42
+ clean:
43
+ find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
44
+ find . -type f -name "*.pyc" -delete 2>/dev/null || true
45
+ rm -rf .coverage htmlcov dist build *.egg-info .pytest_cache .ruff_cache 2>/dev/null || true
@@ -0,0 +1,147 @@
1
+ Metadata-Version: 2.4
2
+ Name: tkm-graphforge
3
+ Version: 0.1.0
4
+ Summary: Knowledge graph builder with extractor, builder, and enricher components
5
+ Author-email: TechKnowMad Labs <admin@techknowmad.ai>
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Keywords: extraction,graph,knowledge-graph,networkx,nlp
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Requires-Python: >=3.12
16
+ Requires-Dist: networkx>=3.3
17
+ Requires-Dist: numpy>=2.4.3
18
+ Requires-Dist: pyyaml>=6.0
19
+ Requires-Dist: scipy>=1.17.1
20
+ Provides-Extra: dev
21
+ Requires-Dist: pytest>=8.0; extra == 'dev'
22
+ Requires-Dist: ruff>=0.4; extra == 'dev'
23
+ Description-Content-Type: text/markdown
24
+
25
+ # GraphForge
26
+
27
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
28
+ [![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/)
29
+ [![Tests](https://img.shields.io/badge/tests-passing-brightgreen.svg)](#quick-start)
30
+
31
+ Knowledge graph construction toolkit — extract entities and relationships from structured records or free text, build queryable directed graphs, and enrich them with network metrics.
32
+
33
+ ---
34
+
35
+ ## Features
36
+
37
+ - **Dual-mode extraction** — parse entities and relationships from dict records or unstructured text via configurable regex patterns
38
+ - **Domain configuration** — define entity types, relationship types, and validation rules in YAML; swap domains without touching code
39
+ - **Graph querying** — find nodes by type, compute shortest paths, list neighbors/predecessors, and extract subgraphs
40
+ - **Network enrichment** — compute PageRank, degree centrality, clustering coefficient, and normalize edge weights in one call
41
+ - **Community detection** — partition graphs using greedy modularity optimization (NetworkX)
42
+ - **Portable serialization** — round-trip graphs to/from plain dicts via node-link format
43
+
44
+ ---
45
+
46
+ ## Quick Start
47
+
48
+ ```bash
49
+ pip install tkm-graphforge
50
+ ```
51
+
52
+ ```python
53
+ from graphforge import GraphBuilder, GraphExtractor, GraphEnricher
54
+ from graphforge.models import Entity, Relationship
55
+
56
+ # Build a graph manually
57
+ builder = GraphBuilder()
58
+ alice = Entity(id="alice", type="person", properties={"name": "Alice"})
59
+ bob = Entity(id="bob", type="person", properties={"name": "Bob"})
60
+ rel = Relationship(source="alice", target="bob", type="knows", weight=1.0)
61
+
62
+ builder.add_entity(alice)
63
+ builder.add_entity(bob)
64
+ builder.add_relationship(rel)
65
+
66
+ # Query
67
+ print(builder.get_neighbors("alice")) # ['bob']
68
+ print(builder.get_shortest_path("alice", "bob"))
69
+
70
+ # Extract from records
71
+ extractor = GraphExtractor()
72
+ records = [{"id": "p1", "type": "paper", "cites": "p2"}]
73
+ entities, relationships = extractor.from_dict(records)
74
+
75
+ # Enrich with metrics
76
+ enricher = GraphEnricher(builder.graph)
77
+ enricher.compute_centrality()
78
+ enricher.compute_pagerank()
79
+ enricher.detect_communities()
80
+ ```
81
+
82
+ ---
83
+
84
+ ## Architecture
85
+
86
+ ```
87
+ graph-forge/
88
+ ├── graphforge/
89
+ │ ├── models.py # Entity and Relationship dataclasses
90
+ │ ├── domains.py # DomainLoader — reads YAML domain configs
91
+ │ ├── builder.py # GraphBuilder — constructs and queries DiGraph
92
+ │ ├── extractor.py # GraphExtractor — parses records and free text
93
+ │ └── enricher.py # GraphEnricher — computes network metrics
94
+ ├── domains/
95
+ │ ├── technology.yaml
96
+ │ ├── science.yaml
97
+ │ └── social.yaml
98
+ └── tests/ # pytest suite, one file per module
99
+ ```
100
+
101
+ **Data flow:**
102
+
103
+ ```
104
+ Raw data (dicts / text)
105
+
106
+ GraphExtractor ← domain YAML controls entity/rel types
107
+
108
+ GraphBuilder ← NetworkX DiGraph under the hood
109
+
110
+ GraphEnricher ← PageRank, centrality, communities
111
+
112
+ Serialized dict / downstream query
113
+ ```
114
+
115
+ ---
116
+
117
+ ## Development
118
+
119
+ ```bash
120
+ git clone https://github.com/TECHKNOWMAD-LABS/graph-forge.git
121
+ cd graph-forge
122
+ pip install -e ".[dev]"
123
+
124
+ # Lint
125
+ ruff check .
126
+
127
+ # Test
128
+ pytest -v
129
+ ```
130
+
131
+ All tests must pass and `ruff check` must be clean before opening a PR.
132
+
133
+ ---
134
+
135
+ ## Contributing
136
+
137
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for the contribution workflow (fork, feature branch, tests, lint, pull request).
138
+
139
+ ---
140
+
141
+ ## License
142
+
143
+ [MIT](LICENSE)
144
+
145
+ ---
146
+
147
+ <sub>Built by [TechKnowMad Labs](https://techknowmad.ai)</sub>