orga 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orga-0.1.0/.gitignore +161 -0
- orga-0.1.0/Dockerfile +28 -0
- orga-0.1.0/LICENSE +21 -0
- orga-0.1.0/PKG-INFO +187 -0
- orga-0.1.0/README.md +122 -0
- orga-0.1.0/design/address-quality-notes.org +30 -0
- orga-0.1.0/design/architecture-overview.org +44 -0
- orga-0.1.0/design/demo-walkthrough.org +75 -0
- orga-0.1.0/design/engineering-boundaries.org +46 -0
- orga-0.1.0/design/known-limitations.org +18 -0
- orga-0.1.0/design/lightweight-model-augmentation-plan.org +34 -0
- orga-0.1.0/design/m7_1_acceptance_report.org +30 -0
- orga-0.1.0/design/problem-family-guidelines.org +43 -0
- orga-0.1.0/design/result-interpretation.org +37 -0
- orga-0.1.0/docker-compose.yml +28 -0
- orga-0.1.0/docs/known-issues.org +28 -0
- orga-0.1.0/docs/service-demo.org +91 -0
- orga-0.1.0/examples/demo_commands.sh +61 -0
- orga-0.1.0/examples/extractor_service/__init__.py +0 -0
- orga-0.1.0/examples/extractor_service/main.py +49 -0
- orga-0.1.0/examples/fixtures/batch_urls.txt +26 -0
- orga-0.1.0/examples/job_service/__init__.py +0 -0
- orga-0.1.0/examples/job_service/main.py +102 -0
- orga-0.1.0/examples/sample_output.json +3525 -0
- orga-0.1.0/examples/sample_urls.txt +14 -0
- orga-0.1.0/orga/__init__.py +4 -0
- orga-0.1.0/orga/cli/main.py +216 -0
- orga-0.1.0/orga/discover/__init__.py +96 -0
- orga-0.1.0/orga/fetch/__init__.py +4 -0
- orga-0.1.0/orga/fetch/httpx_fetcher.py +119 -0
- orga-0.1.0/orga/fetch/strategy.py +8 -0
- orga-0.1.0/orga/governance/__init__.py +152 -0
- orga-0.1.0/orga/governance/classification_aggregator.py +128 -0
- orga-0.1.0/orga/merge/__init__.py +3 -0
- orga-0.1.0/orga/merge/constants.py +50 -0
- orga-0.1.0/orga/merge/processor.py +333 -0
- orga-0.1.0/orga/model/__init__.py +11 -0
- orga-0.1.0/orga/model/config.py +66 -0
- orga-0.1.0/orga/model/document.py +45 -0
- orga-0.1.0/orga/model/profile.py +88 -0
- orga-0.1.0/orga/model/types.py +20 -0
- orga-0.1.0/orga/parse/fields/__init__.py +7 -0
- orga-0.1.0/orga/parse/fields/address_scorer.py +66 -0
- orga-0.1.0/orga/parse/fields/classifier.py +287 -0
- orga-0.1.0/orga/parse/fields/parsers.py +355 -0
- orga-0.1.0/orga/parse/taxonomy.py +114 -0
- orga-0.1.0/orga/pipeline/__init__.py +222 -0
- orga-0.1.0/orga/registry/__init__.py +54 -0
- orga-0.1.0/pyproject.toml +76 -0
- orga-0.1.0/scripts/verify_services_e2e.py +190 -0
- orga-0.1.0/scripts/verify_services_internal.py +179 -0
- orga-0.1.0/service_test_outcome.txt +25689 -0
- orga-0.1.0/tests/conftest.py +79 -0
- orga-0.1.0/tests/examples/test_extractor_service.py +47 -0
- orga-0.1.0/tests/examples/test_job_service.py +74 -0
- orga-0.1.0/tests/integration/test_cli.py +152 -0
- orga-0.1.0/tests/integration/test_pipeline.py +135 -0
- orga-0.1.0/tests/module/test_address_scoring.py +83 -0
- orga-0.1.0/tests/module/test_classification_advanced.py +95 -0
- orga-0.1.0/tests/module/test_classification_layers.py +125 -0
- orga-0.1.0/tests/module/test_classify.py +114 -0
- orga-0.1.0/tests/module/test_concurrency.py +58 -0
- orga-0.1.0/tests/module/test_discover.py +114 -0
- orga-0.1.0/tests/module/test_fetch.py +137 -0
- orga-0.1.0/tests/module/test_location_consolidation.py +99 -0
- orga-0.1.0/tests/module/test_output_sanitization.py +111 -0
- orga-0.1.0/tests/module/test_parse.py +241 -0
- orga-0.1.0/tests/module/test_parse_address_advanced.py +111 -0
- orga-0.1.0/tests/module/test_sanitization.py +153 -0
- orga-0.1.0/tests/unit/test_governance.py +90 -0
- orga-0.1.0/tests/unit/test_merge.py +146 -0
- orga-0.1.0/tests/unit/test_model.py +172 -0
- orga-0.1.0/tests/unit/test_registry.py +63 -0
orga-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py,cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
target/
|
|
76
|
+
|
|
77
|
+
# Jupyter Notebook
|
|
78
|
+
.ipynb_checkpoints
|
|
79
|
+
|
|
80
|
+
# IPython
|
|
81
|
+
profile_default/
|
|
82
|
+
ipython_config.py
|
|
83
|
+
|
|
84
|
+
# pyenv
|
|
85
|
+
.python-version
|
|
86
|
+
|
|
87
|
+
# pipenv
|
|
88
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
89
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
90
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
91
|
+
# install all needed dependencies.
|
|
92
|
+
#Pipfile.lock
|
|
93
|
+
|
|
94
|
+
# poetry
|
|
95
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
96
|
+
# This is especially recommended for binary packages to ensure reproducible builds.
|
|
97
|
+
# However, if you need to support different python versions or platforms, you might want to ignore it.
|
|
98
|
+
#poetry.lock
|
|
99
|
+
|
|
100
|
+
# pdm
|
|
101
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
102
|
+
#pdm.lock
|
|
103
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
104
|
+
# in version control.
|
|
105
|
+
# https://pdm.fming.dev/#use-with-ide
|
|
106
|
+
.pdm.toml
|
|
107
|
+
.pdm-python
|
|
108
|
+
.pdm-build/
|
|
109
|
+
|
|
110
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and others
|
|
111
|
+
__pypackages__/
|
|
112
|
+
|
|
113
|
+
# Celery stuff
|
|
114
|
+
celerybeat-schedule
|
|
115
|
+
celerybeat.pid
|
|
116
|
+
|
|
117
|
+
# SageMath parsed files
|
|
118
|
+
*.sage.py
|
|
119
|
+
|
|
120
|
+
# Environments
|
|
121
|
+
.env
|
|
122
|
+
.venv
|
|
123
|
+
env/
|
|
124
|
+
venv/
|
|
125
|
+
ENV/
|
|
126
|
+
env.bak/
|
|
127
|
+
venv.bak/
|
|
128
|
+
|
|
129
|
+
# Spyder project settings
|
|
130
|
+
.spyderproject
|
|
131
|
+
.spyproject
|
|
132
|
+
|
|
133
|
+
# Rope project settings
|
|
134
|
+
.ropeproject
|
|
135
|
+
|
|
136
|
+
# mkdocs documentation
|
|
137
|
+
/site
|
|
138
|
+
|
|
139
|
+
# mypy
|
|
140
|
+
.mypy_cache/
|
|
141
|
+
.dmypy.json
|
|
142
|
+
dmypy.json
|
|
143
|
+
|
|
144
|
+
# Pyre type checker
|
|
145
|
+
.pyre/
|
|
146
|
+
|
|
147
|
+
# pytype static type analyzer
|
|
148
|
+
.pytype/
|
|
149
|
+
|
|
150
|
+
# Cython debug symbols
|
|
151
|
+
cython_debug/
|
|
152
|
+
|
|
153
|
+
# Editor directories and files
|
|
154
|
+
.idea/
|
|
155
|
+
.vscode/
|
|
156
|
+
*.swp
|
|
157
|
+
*.swo
|
|
158
|
+
|
|
159
|
+
# Project specific
|
|
160
|
+
GEMINI.md
|
|
161
|
+
设计书.org
|
orga-0.1.0/Dockerfile
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Use an official Python runtime as a parent image
|
|
2
|
+
FROM python:3.12-slim
|
|
3
|
+
|
|
4
|
+
# Set the working directory in the container
|
|
5
|
+
WORKDIR /app
|
|
6
|
+
|
|
7
|
+
# Install system dependencies if any (none for now)
|
|
8
|
+
# RUN apt-get update && apt-get install -y ...
|
|
9
|
+
|
|
10
|
+
# Copy the current directory contents into the container at /app
|
|
11
|
+
COPY . /app
|
|
12
|
+
|
|
13
|
+
# Install build tools and dependencies
|
|
14
|
+
# We use pip for a regular (non-editable) install of the package and its dependencies
|
|
15
|
+
# Since we have pyproject.toml, we can install directly
|
|
16
|
+
RUN pip install --no-cache-dir --upgrade pip && \
|
|
17
|
+
pip install --no-cache-dir . && \
|
|
18
|
+
pip install --no-cache-dir "fastapi" "uvicorn"
|
|
19
|
+
|
|
20
|
+
# Expose ports (documentation only; EXPOSE does not actually publish them)
|
|
21
|
+
EXPOSE 8000
|
|
22
|
+
EXPOSE 8001
|
|
23
|
+
|
|
24
|
+
# Define environment variables
|
|
25
|
+
ENV PYTHONUNBUFFERED=1
|
|
26
|
+
|
|
27
|
+
# Default command (can be overridden)
|
|
28
|
+
CMD ["uvicorn", "examples.extractor_service.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
orga-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 KEJIA <DETONG.KEJI@GMAIL.COM>
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
orga-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: orga
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A modular, strategy-driven organization profile extractor.
|
|
5
|
+
Author-email: Xiang Dao <xiangdao@example.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 KEJIA <DETONG.KEJI@GMAIL.COM>
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Classifier: Development Status :: 3 - Alpha
|
|
29
|
+
Classifier: Intended Audience :: Developers
|
|
30
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
31
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
32
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
33
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
34
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
35
|
+
Requires-Python: >=3.10
|
|
36
|
+
Requires-Dist: aiolimiter
|
|
37
|
+
Requires-Dist: email-validator
|
|
38
|
+
Requires-Dist: extruct
|
|
39
|
+
Requires-Dist: hishel
|
|
40
|
+
Requires-Dist: httpx
|
|
41
|
+
Requires-Dist: opentelemetry-api
|
|
42
|
+
Requires-Dist: phonenumbers
|
|
43
|
+
Requires-Dist: pydantic>=2.0
|
|
44
|
+
Requires-Dist: pyyaml
|
|
45
|
+
Requires-Dist: rapidfuzz
|
|
46
|
+
Requires-Dist: selectolax
|
|
47
|
+
Requires-Dist: structlog
|
|
48
|
+
Requires-Dist: tenacity
|
|
49
|
+
Requires-Dist: typer
|
|
50
|
+
Provides-Extra: browser
|
|
51
|
+
Requires-Dist: playwright; extra == 'browser'
|
|
52
|
+
Provides-Extra: dev
|
|
53
|
+
Requires-Dist: black; extra == 'dev'
|
|
54
|
+
Requires-Dist: mypy; extra == 'dev'
|
|
55
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
56
|
+
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
57
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
58
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
59
|
+
Provides-Extra: llm
|
|
60
|
+
Requires-Dist: langextract; extra == 'llm'
|
|
61
|
+
Provides-Extra: server
|
|
62
|
+
Requires-Dist: fastapi; extra == 'server'
|
|
63
|
+
Requires-Dist: uvicorn; extra == 'server'
|
|
64
|
+
Description-Content-Type: text/markdown
|
|
65
|
+
|
|
66
|
+
# ORGA: Deterministic Organization Profiler
|
|
67
|
+
|
|
68
|
+
**A fast, explainable, non-LLM extraction engine for profiling institutional websites.**
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## 🎯 What is ORGA?
|
|
73
|
+
|
|
74
|
+
ORGA is a Python-based profiling engine and microservice suite designed to autonomously navigate an organization's website and extract a highly structured JSON profile. It discovers global locations, extracts clean contact data (phones, emails, social footprints), and determines the organization's primary industry category.
|
|
75
|
+
|
|
76
|
+
**Crucially, ORGA is not an LLM.** It is built entirely on deterministic rules, semantic heuristics, JSON-LD parsing, and lightweight statistical Bayesian models.
|
|
77
|
+
|
|
78
|
+
## ⚡ Output Snapshot
|
|
79
|
+
|
|
80
|
+
*A minimal example of the structured JSON output generated for a hospital website:*
|
|
81
|
+
|
|
82
|
+
```json
|
|
83
|
+
{
|
|
84
|
+
"name": "CHEO",
|
|
85
|
+
"org_type": "Hospital",
|
|
86
|
+
"categories": ["Hospital", "NonProfit"],
|
|
87
|
+
"locations": [
|
|
88
|
+
{
|
|
89
|
+
"address": {
|
|
90
|
+
"raw": "401 Smyth Road, Ottawa ON K1H 8L1",
|
|
91
|
+
"postal_code": "K1H 8L1",
|
|
92
|
+
"city": "Ottawa"
|
|
93
|
+
},
|
|
94
|
+
"confidence": 0.9
|
|
95
|
+
}
|
|
96
|
+
],
|
|
97
|
+
"phones": [
|
|
98
|
+
{ "value": "+16137377600", "kind": "phone" }
|
|
99
|
+
],
|
|
100
|
+
"social_links": [
|
|
101
|
+
{ "value": "https://facebook.com/cheokids", "kind": "social" }
|
|
102
|
+
]
|
|
103
|
+
}
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## 🧠 Why No LLMs?
|
|
107
|
+
|
|
108
|
+
In an era of generative AI, why build a deterministic extractor?
|
|
109
|
+
|
|
110
|
+
1. **Extreme Speed & Cost Efficiency:** ORGA processes a full organization (navigating up to 5 sub-pages like `/about` or `/contact`) in **under 0.7 seconds** per site. It requires negligible CPU/Memory overhead, allowing you to process 10,000 organizations for pennies rather than dollars.
|
|
111
|
+
2. **100% Explainability:** Every extracted phone number, every inferred category (e.g., `Hospital` vs. `University`), and every discarded link is fully traceable. The JSON payload includes a `debug_info` block detailing the exact CSS selector, regex match, or weighted rule path that produced the result.
|
|
112
|
+
3. **No Hallucinations:** When ORGA fails, it fails predictably (e.g., returning an empty field). It will never invent a phone number or confidently hallucinate an office address.
|
|
113
|
+
|
|
114
|
+
## ✨ Core Features
|
|
115
|
+
|
|
116
|
+
* **Intelligent Discovery:** Automatically finds high-value pages (`/contact`, `/locations`, `/about`) from a root URL.
|
|
117
|
+
* **Aggressive Noise Filtering:** Employs suppression matrices and page-weighting to strip out UI navigation noise and generic boilerplate text.
|
|
118
|
+
* **Layered Classification:** Identifies primary institutional types (e.g., `Government`, `Hospital`, `NonProfit`, `InternationalOrg`) using a two-tier weighted keyword and Bayesian frequency model.
|
|
119
|
+
* **Concurrent Microservices:** Includes two Dockerized FastAPI services for real-time single-URL extraction and asynchronous batch processing.
|
|
120
|
+
|
|
121
|
+
## 🛑 Known Boundaries & Limitations
|
|
122
|
+
|
|
123
|
+
ORGA operates at the absolute ceiling of what rule-based extraction can achieve. You should understand its limits:
|
|
124
|
+
|
|
125
|
+
* **Good Fit:** Generating a massive directory of structured contacts and primary categories for standard institutional sites (Hospitals, Universities, NGOs, Government Agencies).
|
|
126
|
+
* **Poor Fit:** Open-world semantic reading tasks, analyzing deep PDF reports, or distinguishing nuanced corporate hierarchies (e.g., distinguishing a holding company from its subsidiary if both use identical website templates).
|
|
127
|
+
* **Address Parsing:** While highly resilient, extracting perfect Street/City/Region splits from unstructured, conversational footers without NLP remains challenging and will occasionally result in `partially_parsed` raw strings.
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## 🚀 Quickstart
|
|
132
|
+
|
|
133
|
+
### 1. Run the Microservices via Docker
|
|
134
|
+
|
|
135
|
+
Ensure you have Docker and Docker Compose installed.
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
git clone https://github.com/discretewater/orga.git
|
|
139
|
+
cd orga
|
|
140
|
+
|
|
141
|
+
# Start the Extractor (8000) and Job Manager (8001) services
|
|
142
|
+
docker compose up --build -d
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### 2. Demo: Single Extraction
|
|
146
|
+
|
|
147
|
+
Extract a profile for the World Health Organization:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
curl -X POST "http://127.0.0.1:8000/extract" \
|
|
151
|
+
-H "Content-Type: application/json" \
|
|
152
|
+
-d '{"url": "https://www.who.int"}' | jq .
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
*You will receive a rich JSON profile containing the WHO's global contact points, social links, and a primary classification of `InternationalOrg`.*
|
|
156
|
+
|
|
157
|
+
### 3. Demo: Batch Processing
|
|
158
|
+
|
|
159
|
+
Submit multiple URLs to the async Job Manager:
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
# 1. Submit the Job
|
|
163
|
+
curl -s -X POST "http://127.0.0.1:8001/jobs" \
|
|
164
|
+
-H "Content-Type: application/json" \
|
|
165
|
+
-d '{"urls": ["https://www.harvard.edu", "https://www.cheo.on.ca"]}'
|
|
166
|
+
|
|
167
|
+
# Expected output: {"job_id": "uuid-...", "status": "pending"}
|
|
168
|
+
|
|
169
|
+
# 2. Poll for Results (replace {job_id} with the job_id returned by the previous step)
|
|
170
|
+
curl -s "http://127.0.0.1:8001/jobs/{job_id}" | jq .
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## 🏗️ Architecture Summary
|
|
176
|
+
|
|
177
|
+
1. **Fetcher:** Utilizes `httpx` and `aiolimiter` to aggressively fetch HTML while respecting concurrency limits.
|
|
178
|
+
2. **Discoverer:** Heuristically scores anchor links to branch out from the root domain into contact/about pages.
|
|
179
|
+
3. **Parsers:** `selectolax`-powered extraction targeting DOM zones, JSON-LD schemas, and normalized regex patterns.
|
|
180
|
+
4. **Classifier:** A tiered engine scoring terms across zones (`<title>`, `<h1>`, `<body>`) against a weighted taxonomy.
|
|
181
|
+
5. **Aggregator:** An institution-level decider that weights page evidence, applies suppression rules (e.g., a strong "Hospital" signal suppresses weak "Association" noise), and yields the final profile.
|
|
182
|
+
|
|
183
|
+
## 🔮 Roadmap
|
|
184
|
+
|
|
185
|
+
ORGA M7.1 is currently in a frozen baseline state.
|
|
186
|
+
|
|
187
|
+
Future enhancements will explore adding a **Lightweight Supervised Post-Calibration Model** (e.g., XGBoost over debug scores) to refine category boundaries without sacrificing the speed and determinism of the core extraction layer. We will not be migrating to an LLM-first architecture.
|
orga-0.1.0/README.md
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# ORGA: Deterministic Organization Profiler
|
|
2
|
+
|
|
3
|
+
**A fast, explainable, non-LLM extraction engine for profiling institutional websites.**
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 🎯 What is ORGA?
|
|
8
|
+
|
|
9
|
+
ORGA is a Python-based profiling engine and microservice suite designed to autonomously navigate an organization's website and extract a highly structured JSON profile. It discovers global locations, extracts clean contact data (phones, emails, social footprints), and determines the organization's primary industry category.
|
|
10
|
+
|
|
11
|
+
**Crucially, ORGA is not an LLM.** It is built entirely on deterministic rules, semantic heuristics, JSON-LD parsing, and lightweight statistical Bayesian models.
|
|
12
|
+
|
|
13
|
+
## ⚡ Output Snapshot
|
|
14
|
+
|
|
15
|
+
*A minimal example of the structured JSON output generated for a hospital website:*
|
|
16
|
+
|
|
17
|
+
```json
|
|
18
|
+
{
|
|
19
|
+
"name": "CHEO",
|
|
20
|
+
"org_type": "Hospital",
|
|
21
|
+
"categories": ["Hospital", "NonProfit"],
|
|
22
|
+
"locations": [
|
|
23
|
+
{
|
|
24
|
+
"address": {
|
|
25
|
+
"raw": "401 Smyth Road, Ottawa ON K1H 8L1",
|
|
26
|
+
"postal_code": "K1H 8L1",
|
|
27
|
+
"city": "Ottawa"
|
|
28
|
+
},
|
|
29
|
+
"confidence": 0.9
|
|
30
|
+
}
|
|
31
|
+
],
|
|
32
|
+
"phones": [
|
|
33
|
+
{ "value": "+16137377600", "kind": "phone" }
|
|
34
|
+
],
|
|
35
|
+
"social_links": [
|
|
36
|
+
{ "value": "https://facebook.com/cheokids", "kind": "social" }
|
|
37
|
+
]
|
|
38
|
+
}
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## 🧠 Why No LLMs?
|
|
42
|
+
|
|
43
|
+
In an era of generative AI, why build a deterministic extractor?
|
|
44
|
+
|
|
45
|
+
1. **Extreme Speed & Cost Efficiency:** ORGA processes a full organization (navigating up to 5 sub-pages like `/about` or `/contact`) in **under 0.7 seconds** per site. It requires negligible CPU/Memory overhead, allowing you to process 10,000 organizations for pennies rather than dollars.
|
|
46
|
+
2. **100% Explainability:** Every extracted phone number, every inferred category (e.g., `Hospital` vs. `University`), and every discarded link is fully traceable. The JSON payload includes a `debug_info` block detailing the exact CSS selector, regex match, or weighted rule path that produced the result.
|
|
47
|
+
3. **No Hallucinations:** When ORGA fails, it fails predictably (e.g., returning an empty field). It will never invent a phone number or confidently hallucinate an office address.
|
|
48
|
+
|
|
49
|
+
## ✨ Core Features
|
|
50
|
+
|
|
51
|
+
* **Intelligent Discovery:** Automatically finds high-value pages (`/contact`, `/locations`, `/about`) from a root URL.
|
|
52
|
+
* **Aggressive Noise Filtering:** Employs suppression matrices and page-weighting to strip out UI navigation noise and generic boilerplate text.
|
|
53
|
+
* **Layered Classification:** Identifies primary institutional types (e.g., `Government`, `Hospital`, `NonProfit`, `InternationalOrg`) using a two-tier weighted keyword and Bayesian frequency model.
|
|
54
|
+
* **Concurrent Microservices:** Includes two Dockerized FastAPI services for real-time single-URL extraction and asynchronous batch processing.
|
|
55
|
+
|
|
56
|
+
## 🛑 Known Boundaries & Limitations
|
|
57
|
+
|
|
58
|
+
ORGA operates at the absolute ceiling of what rule-based extraction can achieve. You should understand its limits:
|
|
59
|
+
|
|
60
|
+
* **Good Fit:** Generating a massive directory of structured contacts and primary categories for standard institutional sites (Hospitals, Universities, NGOs, Government Agencies).
|
|
61
|
+
* **Poor Fit:** Open-world semantic reading tasks, analyzing deep PDF reports, or distinguishing nuanced corporate hierarchies (e.g., distinguishing a holding company from its subsidiary if both use identical website templates).
|
|
62
|
+
* **Address Parsing:** While highly resilient, extracting perfect Street/City/Region splits from unstructured, conversational footers without NLP remains challenging and will occasionally result in `partially_parsed` raw strings.
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## 🚀 Quickstart
|
|
67
|
+
|
|
68
|
+
### 1. Run the Microservices via Docker
|
|
69
|
+
|
|
70
|
+
Ensure you have Docker and Docker Compose installed.
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
git clone https://github.com/discretewater/orga.git
|
|
74
|
+
cd orga
|
|
75
|
+
|
|
76
|
+
# Start the Extractor (8000) and Job Manager (8001) services
|
|
77
|
+
docker compose up --build -d
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### 2. Demo: Single Extraction
|
|
81
|
+
|
|
82
|
+
Extract a profile for the World Health Organization:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
curl -X POST "http://127.0.0.1:8000/extract" \
|
|
86
|
+
-H "Content-Type: application/json" \
|
|
87
|
+
-d '{"url": "https://www.who.int"}' | jq .
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
*You will receive a rich JSON profile containing the WHO's global contact points, social links, and a primary classification of `InternationalOrg`.*
|
|
91
|
+
|
|
92
|
+
### 3. Demo: Batch Processing
|
|
93
|
+
|
|
94
|
+
Submit multiple URLs to the async Job Manager:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
# 1. Submit the Job
|
|
98
|
+
curl -s -X POST "http://127.0.0.1:8001/jobs" \
|
|
99
|
+
-H "Content-Type: application/json" \
|
|
100
|
+
-d '{"urls": ["https://www.harvard.edu", "https://www.cheo.on.ca"]}'
|
|
101
|
+
|
|
102
|
+
# Expected output: {"job_id": "uuid-...", "status": "pending"}
|
|
103
|
+
|
|
104
|
+
# 2. Poll for Results (replace {job_id} with the job_id returned by the previous step)
|
|
105
|
+
curl -s "http://127.0.0.1:8001/jobs/{job_id}" | jq .
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## 🏗️ Architecture Summary
|
|
111
|
+
|
|
112
|
+
1. **Fetcher:** Utilizes `httpx` and `aiolimiter` to aggressively fetch HTML while respecting concurrency limits.
|
|
113
|
+
2. **Discoverer:** Heuristically scores anchor links to branch out from the root domain into contact/about pages.
|
|
114
|
+
3. **Parsers:** `selectolax`-powered extraction targeting DOM zones, JSON-LD schemas, and normalized regex patterns.
|
|
115
|
+
4. **Classifier:** A tiered engine scoring terms across zones (`<title>`, `<h1>`, `<body>`) against a weighted taxonomy.
|
|
116
|
+
5. **Aggregator:** An institution-level decider that weights page evidence, applies suppression rules (e.g., a strong "Hospital" signal suppresses weak "Association" noise), and yields the final profile.
|
|
117
|
+
|
|
118
|
+
## 🔮 Roadmap
|
|
119
|
+
|
|
120
|
+
ORGA M7.1 is currently in a frozen baseline state.
|
|
121
|
+
|
|
122
|
+
Future enhancements will explore adding a **Lightweight Supervised Post-Calibration Model** (e.g., XGBoost over debug scores) to refine category boundaries without sacrificing the speed and determinism of the core extraction layer. We will not be migrating to an LLM-first architecture.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
* Address Quality Analysis and Limited Polish Strategy
|
|
2
|
+
|
|
3
|
+
** 1. Current State Assessment
|
|
4
|
+
|
|
5
|
+
The ORGA address parser (M7.1) is resilient but imprecise. It successfully identifies blocks of text containing address-like signals (Postal Codes, Street suffixes) but struggles to delineate the exact boundaries of that block within noisy HTML footers.
|
|
6
|
+
|
|
7
|
+
*** Common Issues:
|
|
8
|
+
- *Bleeding:* The address string often includes preceding navigation headers ("Contact Us", "Locations") or trailing contact info ("Phone: 555-0199").
|
|
9
|
+
- *Fragmentation:* Multi-line addresses in ~<div>~ soup are sometimes split into separate entities.
|
|
10
|
+
- *False Positives:* ISO numbers or copyright dates that resemble postal codes.
|
|
11
|
+
|
|
12
|
+
** 2. The "Limited Polish" Strategy
|
|
13
|
+
|
|
14
|
+
Per M8.1 directives, we are NOT rewriting the parser. We are applying *generic termination signals*.
|
|
15
|
+
|
|
16
|
+
*** Improved Termination List:
|
|
17
|
+
We define a set of "Hard Stops". If the parser encounters these tokens, it assumes the address block has ended.
|
|
18
|
+
|
|
19
|
+
- *Contact Labels:* "Tel:", "Phone:", "Fax:", "Email:", "Call:"
|
|
20
|
+
- *Social Media:* "Follow us", "Twitter", "Facebook", "Connect"
|
|
21
|
+
- *Legal:* "Copyright", "Rights Reserved", "Privacy Policy"
|
|
22
|
+
- *Navigation:* "Menu", "Home", "About", "Services"
|
|
23
|
+
|
|
24
|
+
** 3. Remaining (Acceptable) Artifacts
|
|
25
|
+
|
|
26
|
+
Even with these stops, some noise is inevitable.
|
|
27
|
+
- *Inline Descriptions:* "Main Entrance at 123 Main St." -> "Main Entrance at" is hard to separate without NLP.
|
|
28
|
+
- *PO Box Variations:* Highly variable formats for P.O. Boxes may be partially captured.
|
|
29
|
+
|
|
30
|
+
These are considered *Acceptable Variances* for a heuristic system.
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
* ORGA Architecture Overview
|
|
2
|
+
|
|
3
|
+
** 1. System Context
|
|
4
|
+
|
|
5
|
+
ORGA is a standalone extraction engine. It takes a URL (or list of URLs) as input and outputs a structured JSON profile. It can run as a CLI tool or as a set of Dockerized microservices.
|
|
6
|
+
|
|
7
|
+
** 2. Core Components
|
|
8
|
+
|
|
9
|
+
*** 1. The Fetcher (Network Layer)
|
|
10
|
+
- **Tech:** `httpx`, `aiolimiter`
|
|
11
|
+
- **Role:** Handles HTTP requests, retries, user-agent rotation, and concurrency limiting.
|
|
12
|
+
- **Key Feature:** "Per-host" throttling to prevent banning.
|
|
13
|
+
|
|
14
|
+
*** 2. The Discoverer (Navigation Layer)
|
|
15
|
+
- **Tech:** `selectolax`
|
|
16
|
+
- **Role:** Analyzes the landing page to find "high-value" sub-pages (`/contact`, `/about`, `/locations`).
|
|
17
|
+
- **Logic:** Uses weighted keyword heuristics on anchor tags to prioritize navigation.
|
|
18
|
+
|
|
19
|
+
*** 3. The Parser (Extraction Layer)
|
|
20
|
+
- **Tech:** `selectolax`, `phonenumbers`, `email-validator`
|
|
21
|
+
- **Role:** Extracts raw entities from the DOM.
|
|
22
|
+
- **Strategies:**
|
|
23
|
+
- *JSON-LD:* Extracts structured schema.org data (High confidence).
|
|
24
|
+
- *Regex:* Scans text for phones/emails (Medium confidence).
|
|
25
|
+
- *Heuristic DOM:* Scans footers/contact blocks for addresses.
|
|
26
|
+
|
|
27
|
+
*** 4. The Classifier (Intelligence Layer)
|
|
28
|
+
- **Tech:** Custom Weighted Rule Engine + Bayesian Fallback
|
|
29
|
+
- **Role:** Determines the organization type (e.g., Hospital, University).
|
|
30
|
+
- **Process:**
|
|
31
|
+
1. *Page Scoring:* Scores each page against a taxonomy (Tier 1 Rules).
|
|
32
|
+
2. *Aggregation:* `ClassificationAggregator` combines scores, applying weights (About > News).
|
|
33
|
+
3. *Suppression:* High-confidence tags (Hospital) suppress low-confidence noise (Association).
|
|
34
|
+
|
|
35
|
+
*** 5. The Merger (Governance Layer)
|
|
36
|
+
- **Role:** The final gatekeeper.
|
|
37
|
+
- **Tasks:** Deduplicates contacts, normalizes phone numbers (E.164), and filters out junk (e.g., social links to "twitter.com/share").
|
|
38
|
+
|
|
39
|
+
** 3. Microservice Design
|
|
40
|
+
|
|
41
|
+
The system is split into two containers for scalability:
|
|
42
|
+
|
|
43
|
+
- **Extractor Service:** Stateless. Handles single requests. Good for scaling horizontally.
|
|
44
|
+
- **Job Service:** Stateful (in-memory or Redis). Manages long-running batches and tracks progress.
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
* ORGA Demo Walkthrough
|
|
2
|
+
|
|
3
|
+
This guide provides a step-by-step walkthrough for demonstrating ORGA to stakeholders or running it as a proof of concept.
|
|
4
|
+
|
|
5
|
+
** 1. Setup (5 Minutes)
|
|
6
|
+
|
|
7
|
+
**Prerequisites:** Docker and Docker Compose.
|
|
8
|
+
|
|
9
|
+
1. **Clone and Enter:**
|
|
10
|
+
#+BEGIN_SRC bash
|
|
11
|
+
git clone https://github.com/discretewater/orga.git
|
|
12
|
+
cd orga
|
|
13
|
+
#+END_SRC
|
|
14
|
+
|
|
15
|
+
2. **Launch Services:**
|
|
16
|
+
#+BEGIN_SRC bash
|
|
17
|
+
docker compose up --build -d
|
|
18
|
+
#+END_SRC
|
|
19
|
+
|
|
20
|
+
3. **Verify Health:**
|
|
21
|
+
#+BEGIN_SRC bash
|
|
22
|
+
curl localhost:8000/health
|
|
23
|
+
# Returns: {"status":"ok"}
|
|
24
|
+
#+END_SRC
|
|
25
|
+
|
|
26
|
+
** 2. Scenario A: The "Real-Time" Lookup
|
|
27
|
+
|
|
28
|
+
*Goal:* Show how fast ORGA profiles a single site.
|
|
29
|
+
|
|
30
|
+
**Action:**
|
|
31
|
+
Run this command to profile the WHO:
|
|
32
|
+
#+BEGIN_SRC bash
|
|
33
|
+
curl -X POST "http://127.0.0.1:8000/extract" \
|
|
34
|
+
-H "Content-Type: application/json" \
|
|
35
|
+
-d '{"url": "https://www.who.int"}' | jq .
|
|
36
|
+
#+END_SRC
|
|
37
|
+
|
|
38
|
+
**Talking Points:**
|
|
39
|
+
- Point out the `org_type`: "InternationalOrg".
|
|
40
|
+
- Show the `social_links`: Clean, validated URLs.
|
|
41
|
+
- Highlight speed: "This took < 1 second."
|
|
42
|
+
|
|
43
|
+
** 3. Scenario B: The "Batch Ingestion"
|
|
44
|
+
|
|
45
|
+
*Goal:* Demonstrate robustness and async processing.
|
|
46
|
+
|
|
47
|
+
**Action:**
|
|
48
|
+
1. Submit a job with a mix of site types:
|
|
49
|
+
#+BEGIN_SRC bash
|
|
50
|
+
curl -X POST "http://127.0.0.1:8001/jobs" \
|
|
51
|
+
-H "Content-Type: application/json" \
|
|
52
|
+
-d '{ "urls": [
|
|
53
|
+
"https://www.harvard.edu",
|
|
54
|
+
"https://www.cheo.on.ca",
|
|
55
|
+
"https://www.greenpeace.org"
|
|
56
|
+
]}'
|
|
57
|
+
#+END_SRC
|
|
58
|
+
*Copy the `job_id` returned.*
|
|
59
|
+
|
|
60
|
+
2. Poll for results:
|
|
61
|
+
#+BEGIN_SRC bash
|
|
62
|
+
curl "http://127.0.0.1:8001/jobs/<YOUR_JOB_ID>" | jq .
|
|
63
|
+
#+END_SRC
|
|
64
|
+
|
|
65
|
+
**Talking Points:**
|
|
66
|
+
- ORGA handles concurrency automatically.
|
|
67
|
+
- It respects rate limits per host.
|
|
68
|
+
- It survives blocked pages without crashing the batch.
|
|
69
|
+
|
|
70
|
+
** 4. Interpreting the "Black Box" (Debug Info)
|
|
71
|
+
|
|
72
|
+
In the JSON output, scroll down to `debug_info`.
|
|
73
|
+
- Show `classification_debug`.
|
|
74
|
+
- Explain how ORGA weighed the `/about` page higher than the `/news` page to determine the category.
|
|
75
|
+
- This proves the system is **deterministic**, not a "black box" AI.
|