orga 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. orga-0.1.0/.gitignore +161 -0
  2. orga-0.1.0/Dockerfile +28 -0
  3. orga-0.1.0/LICENSE +21 -0
  4. orga-0.1.0/PKG-INFO +187 -0
  5. orga-0.1.0/README.md +122 -0
  6. orga-0.1.0/design/address-quality-notes.org +30 -0
  7. orga-0.1.0/design/architecture-overview.org +44 -0
  8. orga-0.1.0/design/demo-walkthrough.org +75 -0
  9. orga-0.1.0/design/engineering-boundaries.org +46 -0
  10. orga-0.1.0/design/known-limitations.org +18 -0
  11. orga-0.1.0/design/lightweight-model-augmentation-plan.org +34 -0
  12. orga-0.1.0/design/m7_1_acceptance_report.org +30 -0
  13. orga-0.1.0/design/problem-family-guidelines.org +43 -0
  14. orga-0.1.0/design/result-interpretation.org +37 -0
  15. orga-0.1.0/docker-compose.yml +28 -0
  16. orga-0.1.0/docs/known-issues.org +28 -0
  17. orga-0.1.0/docs/service-demo.org +91 -0
  18. orga-0.1.0/examples/demo_commands.sh +61 -0
  19. orga-0.1.0/examples/extractor_service/__init__.py +0 -0
  20. orga-0.1.0/examples/extractor_service/main.py +49 -0
  21. orga-0.1.0/examples/fixtures/batch_urls.txt +26 -0
  22. orga-0.1.0/examples/job_service/__init__.py +0 -0
  23. orga-0.1.0/examples/job_service/main.py +102 -0
  24. orga-0.1.0/examples/sample_output.json +3525 -0
  25. orga-0.1.0/examples/sample_urls.txt +14 -0
  26. orga-0.1.0/orga/__init__.py +4 -0
  27. orga-0.1.0/orga/cli/main.py +216 -0
  28. orga-0.1.0/orga/discover/__init__.py +96 -0
  29. orga-0.1.0/orga/fetch/__init__.py +4 -0
  30. orga-0.1.0/orga/fetch/httpx_fetcher.py +119 -0
  31. orga-0.1.0/orga/fetch/strategy.py +8 -0
  32. orga-0.1.0/orga/governance/__init__.py +152 -0
  33. orga-0.1.0/orga/governance/classification_aggregator.py +128 -0
  34. orga-0.1.0/orga/merge/__init__.py +3 -0
  35. orga-0.1.0/orga/merge/constants.py +50 -0
  36. orga-0.1.0/orga/merge/processor.py +333 -0
  37. orga-0.1.0/orga/model/__init__.py +11 -0
  38. orga-0.1.0/orga/model/config.py +66 -0
  39. orga-0.1.0/orga/model/document.py +45 -0
  40. orga-0.1.0/orga/model/profile.py +88 -0
  41. orga-0.1.0/orga/model/types.py +20 -0
  42. orga-0.1.0/orga/parse/fields/__init__.py +7 -0
  43. orga-0.1.0/orga/parse/fields/address_scorer.py +66 -0
  44. orga-0.1.0/orga/parse/fields/classifier.py +287 -0
  45. orga-0.1.0/orga/parse/fields/parsers.py +355 -0
  46. orga-0.1.0/orga/parse/taxonomy.py +114 -0
  47. orga-0.1.0/orga/pipeline/__init__.py +222 -0
  48. orga-0.1.0/orga/registry/__init__.py +54 -0
  49. orga-0.1.0/pyproject.toml +76 -0
  50. orga-0.1.0/scripts/verify_services_e2e.py +190 -0
  51. orga-0.1.0/scripts/verify_services_internal.py +179 -0
  52. orga-0.1.0/service_test_outcome.txt +25689 -0
  53. orga-0.1.0/tests/conftest.py +79 -0
  54. orga-0.1.0/tests/examples/test_extractor_service.py +47 -0
  55. orga-0.1.0/tests/examples/test_job_service.py +74 -0
  56. orga-0.1.0/tests/integration/test_cli.py +152 -0
  57. orga-0.1.0/tests/integration/test_pipeline.py +135 -0
  58. orga-0.1.0/tests/module/test_address_scoring.py +83 -0
  59. orga-0.1.0/tests/module/test_classification_advanced.py +95 -0
  60. orga-0.1.0/tests/module/test_classification_layers.py +125 -0
  61. orga-0.1.0/tests/module/test_classify.py +114 -0
  62. orga-0.1.0/tests/module/test_concurrency.py +58 -0
  63. orga-0.1.0/tests/module/test_discover.py +114 -0
  64. orga-0.1.0/tests/module/test_fetch.py +137 -0
  65. orga-0.1.0/tests/module/test_location_consolidation.py +99 -0
  66. orga-0.1.0/tests/module/test_output_sanitization.py +111 -0
  67. orga-0.1.0/tests/module/test_parse.py +241 -0
  68. orga-0.1.0/tests/module/test_parse_address_advanced.py +111 -0
  69. orga-0.1.0/tests/module/test_sanitization.py +153 -0
  70. orga-0.1.0/tests/unit/test_governance.py +90 -0
  71. orga-0.1.0/tests/unit/test_merge.py +146 -0
  72. orga-0.1.0/tests/unit/test_model.py +172 -0
  73. orga-0.1.0/tests/unit/test_registry.py +63 -0
orga-0.1.0/.gitignore ADDED
@@ -0,0 +1,161 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # poetry
95
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
96
+ # This is especially recommended for binary packages to ensure reproducible builds.
97
+ # However, if you need to support different python versions or platforms, you might want to ignore it.
98
+ #poetry.lock
99
+
100
+ # pdm
101
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
102
+ #pdm.lock
103
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
104
+ # in version control.
105
+ # https://pdm.fming.dev/#use-with-ide
106
+ .pdm.toml
107
+ .pdm-python
108
+ .pdm-build/
109
+
110
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and others
111
+ __pypackages__/
112
+
113
+ # Celery stuff
114
+ celerybeat-schedule
115
+ celerybeat.pid
116
+
117
+ # SageMath parsed files
118
+ *.sage.py
119
+
120
+ # Environments
121
+ .env
122
+ .venv
123
+ env/
124
+ venv/
125
+ ENV/
126
+ env.bak/
127
+ venv.bak/
128
+
129
+ # Spyder project settings
130
+ .spyderproject
131
+ .spyproject
132
+
133
+ # Rope project settings
134
+ .ropeproject
135
+
136
+ # mkdocs documentation
137
+ /site
138
+
139
+ # mypy
140
+ .mypy_cache/
141
+ .dmypy.json
142
+ dmypy.json
143
+
144
+ # Pyre type checker
145
+ .pyre/
146
+
147
+ # pytype static type analyzer
148
+ .pytype/
149
+
150
+ # Cython debug symbols
151
+ cython_debug/
152
+
153
+ # Editor directories and files
154
+ .idea/
155
+ .vscode/
156
+ *.swp
157
+ *.swo
158
+
159
+ # Project specific
160
+ GEMINI.md
161
+ 设计书.org
orga-0.1.0/Dockerfile ADDED
@@ -0,0 +1,28 @@
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.12-slim
3
+
4
+ # Set the working directory in the container
5
+ WORKDIR /app
6
+
7
+ # Install system dependencies if any (none for now)
8
+ # RUN apt-get update && apt-get install -y ...
9
+
10
+ # Copy the current directory contents into the container at /app
11
+ COPY . /app
12
+
13
+ # Install build tools and dependencies
14
+ # We use pip to install the package itself (a regular, non-editable install)
15
+ # Since we have pyproject.toml, we can install directly
16
+ RUN pip install --no-cache-dir --upgrade pip && \
17
+ pip install --no-cache-dir . && \
18
+ pip install --no-cache-dir "fastapi" "uvicorn"
19
+
20
+ # Expose ports (documentation only; EXPOSE does not publish them)
21
+ EXPOSE 8000
22
+ EXPOSE 8001
23
+
24
+ # Define environment variables
25
+ ENV PYTHONUNBUFFERED=1
26
+
27
+ # Default entrypoint (can be overridden)
28
+ CMD ["uvicorn", "examples.extractor_service.main:app", "--host", "0.0.0.0", "--port", "8000"]
orga-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 KEJIA <DETONG.KEJI@GMAIL.COM>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
orga-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,187 @@
1
+ Metadata-Version: 2.4
2
+ Name: orga
3
+ Version: 0.1.0
4
+ Summary: A modular, strategy-driven organization profile extractor.
5
+ Author-email: Xiang Dao <xiangdao@example.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 KEJIA <DETONG.KEJI@GMAIL.COM>
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+ License-File: LICENSE
28
+ Classifier: Development Status :: 3 - Alpha
29
+ Classifier: Intended Audience :: Developers
30
+ Classifier: Programming Language :: Python :: 3.10
31
+ Classifier: Programming Language :: Python :: 3.11
32
+ Classifier: Programming Language :: Python :: 3.12
33
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
34
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
35
+ Requires-Python: >=3.10
36
+ Requires-Dist: aiolimiter
37
+ Requires-Dist: email-validator
38
+ Requires-Dist: extruct
39
+ Requires-Dist: hishel
40
+ Requires-Dist: httpx
41
+ Requires-Dist: opentelemetry-api
42
+ Requires-Dist: phonenumbers
43
+ Requires-Dist: pydantic>=2.0
44
+ Requires-Dist: pyyaml
45
+ Requires-Dist: rapidfuzz
46
+ Requires-Dist: selectolax
47
+ Requires-Dist: structlog
48
+ Requires-Dist: tenacity
49
+ Requires-Dist: typer
50
+ Provides-Extra: browser
51
+ Requires-Dist: playwright; extra == 'browser'
52
+ Provides-Extra: dev
53
+ Requires-Dist: black; extra == 'dev'
54
+ Requires-Dist: mypy; extra == 'dev'
55
+ Requires-Dist: pytest; extra == 'dev'
56
+ Requires-Dist: pytest-asyncio; extra == 'dev'
57
+ Requires-Dist: pytest-cov; extra == 'dev'
58
+ Requires-Dist: ruff; extra == 'dev'
59
+ Provides-Extra: llm
60
+ Requires-Dist: langextract; extra == 'llm'
61
+ Provides-Extra: server
62
+ Requires-Dist: fastapi; extra == 'server'
63
+ Requires-Dist: uvicorn; extra == 'server'
64
+ Description-Content-Type: text/markdown
65
+
66
+ # ORGA: Deterministic Organization Profiler
67
+
68
+ **A fast, explainable, non-LLM extraction engine for profiling institutional websites.**
69
+
70
+ ---
71
+
72
+ ## 🎯 What is ORGA?
73
+
74
+ ORGA is a Python-based profiling engine and microservice suite designed to autonomously navigate an organization's website and extract a highly structured JSON profile. It discovers global locations, extracts clean contact data (phones, emails, social footprints), and determines the organization's primary industry category.
75
+
76
+ **Crucially, ORGA is not an LLM.** It is built entirely on deterministic rules, semantic heuristics, JSON-LD parsing, and lightweight statistical Bayesian models.
77
+
78
+ ## ⚡ Output Snapshot
79
+
80
+ *A minimal example of the structured JSON output generated for a hospital website:*
81
+
82
+ ```json
83
+ {
84
+ "name": "CHEO",
85
+ "org_type": "Hospital",
86
+ "categories": ["Hospital", "NonProfit"],
87
+ "locations": [
88
+ {
89
+ "address": {
90
+ "raw": "401 Smyth Road, Ottawa ON K1H 8L1",
91
+ "postal_code": "K1H 8L1",
92
+ "city": "Ottawa"
93
+ },
94
+ "confidence": 0.9
95
+ }
96
+ ],
97
+ "phones": [
98
+ { "value": "+16137377600", "kind": "phone" }
99
+ ],
100
+ "social_links": [
101
+ { "value": "https://facebook.com/cheokids", "kind": "social" }
102
+ ]
103
+ }
104
+ ```
105
+
106
+ ## 🧠 Why No LLMs?
107
+
108
+ In an era of generative AI, why build a deterministic extractor?
109
+
110
+ 1. **Extreme Speed & Cost Efficiency:** ORGA processes a full organization (navigating up to 5 sub-pages like `/about` or `/contact`) in **under 0.7 seconds** per site. It requires negligible CPU/Memory overhead, allowing you to process 10,000 organizations for pennies rather than dollars.
111
+ 2. **100% Explainability:** Every extracted phone number, every inferred category (e.g., `Hospital` vs. `University`), and every discarded link is fully traceable. The JSON payload includes a `debug_info` block detailing the exact CSS selector, regex match, or weighted rule path that produced the result.
112
+ 3. **No Hallucinations:** When ORGA fails, it fails predictably (e.g., returning an empty field). It will never invent a phone number or confidently hallucinate an office address.
113
+
114
+ ## ✨ Core Features
115
+
116
+ * **Intelligent Discovery:** Automatically finds high-value pages (`/contact`, `/locations`, `/about`) from a root URL.
117
+ * **Aggressive Noise Filtering:** Employs suppression matrices and page-weighting to strip out UI navigation noise and generic boilerplate text.
118
+ * **Layered Classification:** Identifies primary institutional types (e.g., `Government`, `Hospital`, `NonProfit`, `InternationalOrg`) using a two-tier weighted keyword and Bayesian frequency model.
119
+ * **Concurrent Microservices:** Includes two Dockerized FastAPI services for real-time single-URL extraction and asynchronous batch processing.
120
+
121
+ ## 🛑 Known Boundaries & Limitations
122
+
123
+ ORGA operates at the absolute ceiling of what rule-based extraction can achieve. You should understand its limits:
124
+
125
+ * **Good Fit:** Generating a massive directory of structured contacts and primary categories for standard institutional sites (Hospitals, Universities, NGOs, Government Agencies).
126
+ * **Poor Fit:** Open-world semantic reading tasks, analyzing deep PDF reports, or distinguishing nuanced corporate hierarchies (e.g., distinguishing a holding company from its subsidiary if both use identical website templates).
127
+ * **Address Parsing:** While highly resilient, extracting perfect Street/City/Region splits from unstructured, conversational footers without NLP remains challenging and will occasionally result in `partially_parsed` raw strings.
128
+
129
+ ---
130
+
131
+ ## 🚀 Quickstart
132
+
133
+ ### 1. Run the Microservices via Docker
134
+
135
+ Ensure you have Docker and Docker Compose installed.
136
+
137
+ ```bash
138
+ git clone https://github.com/discretewater/orga.git
139
+ cd orga
140
+
141
+ # Start the Extractor (8000) and Job Manager (8001) services
142
+ docker compose up --build -d
143
+ ```
144
+
145
+ ### 2. Demo: Single Extraction
146
+
147
+ Extract a profile for the World Health Organization:
148
+
149
+ ```bash
150
+ curl -X POST "http://127.0.0.1:8000/extract" \
151
+ -H "Content-Type: application/json" \
152
+ -d '{"url": "https://www.who.int"}' | jq .
153
+ ```
154
+
155
+ *You will receive a rich JSON profile containing the WHO's global contact points, social links, and a primary classification of `InternationalOrg`.*
156
+
157
+ ### 3. Demo: Batch Processing
158
+
159
+ Submit multiple URLs to the async Job Manager:
160
+
161
+ ```bash
162
+ # 1. Submit the Job
163
+ curl -s -X POST "http://127.0.0.1:8001/jobs" \
164
+ -H "Content-Type: application/json" \
165
+ -d '{"urls": ["https://www.harvard.edu", "https://www.cheo.on.ca"]}'
166
+
167
+ # Expected output: {"job_id": "uuid-...", "status": "pending"}
168
+
169
+ # 2. Poll for Results (Replace {job_id} with the job_id returned by the previous step)
170
+ curl -s "http://127.0.0.1:8001/jobs/{job_id}" | jq .
171
+ ```
172
+
173
+ ---
174
+
175
+ ## 🏗️ Architecture Summary
176
+
177
+ 1. **Fetcher:** Utilizes `httpx` and `aiolimiter` to aggressively fetch HTML while respecting concurrency limits.
178
+ 2. **Discoverer:** Heuristically scores anchor links to branch out from the root domain into contact/about pages.
179
+ 3. **Parsers:** `selectolax`-powered extraction targeting DOM zones, JSON-LD schemas, and normalized regex patterns.
180
+ 4. **Classifier:** A tiered engine scoring terms across zones (`<title>`, `<h1>`, `<body>`) against a weighted taxonomy.
181
+ 5. **Aggregator:** An institution-level decider that weights page evidence, applies suppression rules (e.g., a strong "Hospital" signal suppresses weak "Association" noise), and yields the final profile.
182
+
183
+ ## 🔮 Roadmap
184
+
185
+ ORGA M7.1 is currently in a frozen baseline state.
186
+
187
+ Future enhancements will explore adding a **Lightweight Supervised Post-Calibration Model** (e.g., XGBoost over debug scores) to refine category boundaries without sacrificing the speed and determinism of the core extraction layer. We will not be migrating to an LLM-first architecture.
orga-0.1.0/README.md ADDED
@@ -0,0 +1,122 @@
1
+ # ORGA: Deterministic Organization Profiler
2
+
3
+ **A fast, explainable, non-LLM extraction engine for profiling institutional websites.**
4
+
5
+ ---
6
+
7
+ ## 🎯 What is ORGA?
8
+
9
+ ORGA is a Python-based profiling engine and microservice suite designed to autonomously navigate an organization's website and extract a highly structured JSON profile. It discovers global locations, extracts clean contact data (phones, emails, social footprints), and determines the organization's primary industry category.
10
+
11
+ **Crucially, ORGA is not an LLM.** It is built entirely on deterministic rules, semantic heuristics, JSON-LD parsing, and lightweight statistical Bayesian models.
12
+
13
+ ## ⚡ Output Snapshot
14
+
15
+ *A minimal example of the structured JSON output generated for a hospital website:*
16
+
17
+ ```json
18
+ {
19
+ "name": "CHEO",
20
+ "org_type": "Hospital",
21
+ "categories": ["Hospital", "NonProfit"],
22
+ "locations": [
23
+ {
24
+ "address": {
25
+ "raw": "401 Smyth Road, Ottawa ON K1H 8L1",
26
+ "postal_code": "K1H 8L1",
27
+ "city": "Ottawa"
28
+ },
29
+ "confidence": 0.9
30
+ }
31
+ ],
32
+ "phones": [
33
+ { "value": "+16137377600", "kind": "phone" }
34
+ ],
35
+ "social_links": [
36
+ { "value": "https://facebook.com/cheokids", "kind": "social" }
37
+ ]
38
+ }
39
+ ```
40
+
41
+ ## 🧠 Why No LLMs?
42
+
43
+ In an era of generative AI, why build a deterministic extractor?
44
+
45
+ 1. **Extreme Speed & Cost Efficiency:** ORGA processes a full organization (navigating up to 5 sub-pages like `/about` or `/contact`) in **under 0.7 seconds** per site. It requires negligible CPU/Memory overhead, allowing you to process 10,000 organizations for pennies rather than dollars.
46
+ 2. **100% Explainability:** Every extracted phone number, every inferred category (e.g., `Hospital` vs. `University`), and every discarded link is fully traceable. The JSON payload includes a `debug_info` block detailing the exact CSS selector, regex match, or weighted rule path that produced the result.
47
+ 3. **No Hallucinations:** When ORGA fails, it fails predictably (e.g., returning an empty field). It will never invent a phone number or confidently hallucinate an office address.
48
+
49
+ ## ✨ Core Features
50
+
51
+ * **Intelligent Discovery:** Automatically finds high-value pages (`/contact`, `/locations`, `/about`) from a root URL.
52
+ * **Aggressive Noise Filtering:** Employs suppression matrices and page-weighting to strip out UI navigation noise and generic boilerplate text.
53
+ * **Layered Classification:** Identifies primary institutional types (e.g., `Government`, `Hospital`, `NonProfit`, `InternationalOrg`) using a two-tier weighted keyword and Bayesian frequency model.
54
+ * **Concurrent Microservices:** Includes two Dockerized FastAPI services for real-time single-URL extraction and asynchronous batch processing.
55
+
56
+ ## 🛑 Known Boundaries & Limitations
57
+
58
+ ORGA operates at the absolute ceiling of what rule-based extraction can achieve. You should understand its limits:
59
+
60
+ * **Good Fit:** Generating a massive directory of structured contacts and primary categories for standard institutional sites (Hospitals, Universities, NGOs, Government Agencies).
61
+ * **Poor Fit:** Open-world semantic reading tasks, analyzing deep PDF reports, or distinguishing nuanced corporate hierarchies (e.g., distinguishing a holding company from its subsidiary if both use identical website templates).
62
+ * **Address Parsing:** While highly resilient, extracting perfect Street/City/Region splits from unstructured, conversational footers without NLP remains challenging and will occasionally result in `partially_parsed` raw strings.
63
+
64
+ ---
65
+
66
+ ## 🚀 Quickstart
67
+
68
+ ### 1. Run the Microservices via Docker
69
+
70
+ Ensure you have Docker and Docker Compose installed.
71
+
72
+ ```bash
73
+ git clone https://github.com/discretewater/orga.git
74
+ cd orga
75
+
76
+ # Start the Extractor (8000) and Job Manager (8001) services
77
+ docker compose up --build -d
78
+ ```
79
+
80
+ ### 2. Demo: Single Extraction
81
+
82
+ Extract a profile for the World Health Organization:
83
+
84
+ ```bash
85
+ curl -X POST "http://127.0.0.1:8000/extract" \
86
+ -H "Content-Type: application/json" \
87
+ -d '{"url": "https://www.who.int"}' | jq .
88
+ ```
89
+
90
+ *You will receive a rich JSON profile containing the WHO's global contact points, social links, and a primary classification of `InternationalOrg`.*
91
+
92
+ ### 3. Demo: Batch Processing
93
+
94
+ Submit multiple URLs to the async Job Manager:
95
+
96
+ ```bash
97
+ # 1. Submit the Job
98
+ curl -s -X POST "http://127.0.0.1:8001/jobs" \
99
+ -H "Content-Type: application/json" \
100
+ -d '{"urls": ["https://www.harvard.edu", "https://www.cheo.on.ca"]}'
101
+
102
+ # Expected output: {"job_id": "uuid-...", "status": "pending"}
103
+
104
+ # 2. Poll for Results (Replace {job_id} with the job_id returned by the previous step)
105
+ curl -s "http://127.0.0.1:8001/jobs/{job_id}" | jq .
106
+ ```
107
+
108
+ ---
109
+
110
+ ## 🏗️ Architecture Summary
111
+
112
+ 1. **Fetcher:** Utilizes `httpx` and `aiolimiter` to aggressively fetch HTML while respecting concurrency limits.
113
+ 2. **Discoverer:** Heuristically scores anchor links to branch out from the root domain into contact/about pages.
114
+ 3. **Parsers:** `selectolax`-powered extraction targeting DOM zones, JSON-LD schemas, and normalized regex patterns.
115
+ 4. **Classifier:** A tiered engine scoring terms across zones (`<title>`, `<h1>`, `<body>`) against a weighted taxonomy.
116
+ 5. **Aggregator:** An institution-level decider that weights page evidence, applies suppression rules (e.g., a strong "Hospital" signal suppresses weak "Association" noise), and yields the final profile.
117
+
118
+ ## 🔮 Roadmap
119
+
120
+ ORGA M7.1 is currently in a frozen baseline state.
121
+
122
+ Future enhancements will explore adding a **Lightweight Supervised Post-Calibration Model** (e.g., XGBoost over debug scores) to refine category boundaries without sacrificing the speed and determinism of the core extraction layer. We will not be migrating to an LLM-first architecture.
orga-0.1.0/design/address-quality-notes.org ADDED
@@ -0,0 +1,30 @@
1
+ * Address Quality Analysis and Limited Polish Strategy
2
+
3
+ ** 1. Current State Assessment
4
+
5
+ The ORGA address parser (M7.1) is resilient but imprecise. It successfully identifies blocks of text containing address-like signals (Postal Codes, Street suffixes) but struggles to delineate the exact boundaries of that block within noisy HTML footers.
6
+
7
+ *** Common Issues:
8
+ - **Bleeding:** The address string often includes preceding navigation headers ("Contact Us", "Locations") or succeeding contact info ("Phone: 555-0199").
9
+ - **Fragmentation:** Multi-line addresses in `<div>` soup are sometimes split into separate entities.
10
+ - **False Positives:** ISO numbers or copyright dates that resemble postal codes can be mistaken for address signals.
11
+
12
+ ** 2. The "Limited Polish" Strategy
13
+
14
+ Per M8.1 directives, we are NOT rewriting the parser. We are applying **generic termination signals**.
15
+
16
+ *** Improved Termination List:
17
+ We define a set of "Hard Stops". If the parser encounters these tokens, it assumes the address block has ended.
18
+
19
+ - *Contact Labels:* "Tel:", "Phone:", "Fax:", "Email:", "Call:"
20
+ - *Social Media:* "Follow us", "Twitter", "Facebook", "Connect"
21
+ - *Legal:* "Copyright", "Rights Reserved", "Privacy Policy"
22
+ - *Navigation:* "Menu", "Home", "About", "Services"
23
+
24
+ ** 3. Remaining (Acceptable) Artifacts
25
+
26
+ Even with these stops, some noise is inevitable.
27
+ - *Inline Descriptions:* "Main Entrance at 123 Main St." -> "Main Entrance at" is hard to separate without NLP.
28
+ - *PO Box Variations:* Highly variable formats for P.O. Boxes may be partially captured.
29
+
30
+ These are considered **Acceptable Variances** for a heuristic system.
orga-0.1.0/design/architecture-overview.org ADDED
@@ -0,0 +1,44 @@
1
+ * ORGA Architecture Overview
2
+
3
+ ** 1. System Context
4
+
5
+ ORGA is a standalone extraction engine. It takes a URL (or list of URLs) as input and outputs a structured JSON profile. It can run as a CLI tool or as a set of Dockerized microservices.
6
+
7
+ ** 2. Core Components
8
+
9
+ *** 1. The Fetcher (Network Layer)
10
+ - **Tech:** `httpx`, `aiolimiter`
11
+ - **Role:** Handles HTTP requests, retries, user-agent rotation, and concurrency limiting.
12
+ - **Key Feature:** "Per-host" throttling to prevent banning.
13
+
14
+ *** 2. The Discoverer (Navigation Layer)
15
+ - **Tech:** `selectolax`
16
+ - **Role:** Analyzes the landing page to find "high-value" sub-pages (`/contact`, `/about`, `/locations`).
17
+ - **Logic:** Uses weighted keyword heuristics on anchor tags to prioritize navigation.
18
+
19
+ *** 3. The Parser (Extraction Layer)
20
+ - **Tech:** `selectolax`, `phonenumbers`, `email-validator`
21
+ - **Role:** Extracts raw entities from the DOM.
22
+ - **Strategies:**
23
+ - *JSON-LD:* Extracts structured schema.org data (High confidence).
24
+ - *Regex:* Scans text for phones/emails (Medium confidence).
25
+ - *Heuristic DOM:* Scans footers/contact blocks for addresses.
26
+
27
+ *** 4. The Classifier (Intelligence Layer)
28
+ - **Tech:** Custom Weighted Rule Engine + Bayesian Fallback
29
+ - **Role:** Determines the organization type (e.g., Hospital, University).
30
+ - **Process:**
31
+ 1. *Page Scoring:* Scores each page against a taxonomy (Tier 1 Rules).
32
+ 2. *Aggregation:* `ClassificationAggregator` combines scores, applying weights (About > News).
33
+ 3. *Suppression:* High-confidence tags (Hospital) suppress low-confidence noise (Association).
34
+
35
+ *** 5. The Merger (Governance Layer)
36
+ - **Role:** The final gatekeeper.
37
+ - **Tasks:** Deduplicates contacts, normalizes phone numbers (E.164), and filters out junk (e.g., social links to "twitter.com/share").
38
+
39
+ ** 3. Microservice Design
40
+
41
+ The system is split into two containers for scalability:
42
+
43
+ - **Extractor Service:** Stateless. Handles single requests. Good for scaling horizontally.
44
+ - **Job Service:** Stateful (in-memory or Redis). Manages long-running batches and tracks progress.
orga-0.1.0/design/demo-walkthrough.org ADDED
@@ -0,0 +1,75 @@
1
+ * ORGA Demo Walkthrough
2
+
3
+ This guide provides a step-by-step walkthrough for demonstrating ORGA to stakeholders or running it as a proof of concept.
4
+
5
+ ** 1. Setup (5 Minutes)
6
+
7
+ **Prerequisites:** Docker and Docker Compose.
8
+
9
+ 1. **Clone and Enter:**
10
+ #+BEGIN_SRC bash
11
+ git clone https://github.com/discretewater/orga.git
12
+ cd orga
13
+ #+END_SRC
14
+
15
+ 2. **Launch Services:**
16
+ #+BEGIN_SRC bash
17
+ docker compose up --build -d
18
+ #+END_SRC
19
+
20
+ 3. **Verify Health:**
21
+ #+BEGIN_SRC bash
22
+ curl localhost:8000/health
23
+ # Returns: {"status":"ok"}
24
+ #+END_SRC
25
+
26
+ ** 2. Scenario A: The "Real-Time" Lookup
27
+
28
+ *Goal:* Show how fast ORGA profiles a single site.
29
+
30
+ **Action:**
31
+ Run this command to profile the WHO:
32
+ #+BEGIN_SRC bash
33
+ curl -X POST "http://127.0.0.1:8000/extract" \
34
+ -H "Content-Type: application/json" \
35
+ -d '{"url": "https://www.who.int"}' | jq .
36
+ #+END_SRC
37
+
38
+ **Talking Points:**
39
+ - Point out the `org_type`: "InternationalOrg".
40
+ - Show the `social_links`: Clean, validated URLs.
41
+ - Highlight speed: "This took < 1 second."
42
+
43
+ ** 3. Scenario B: The "Batch Ingestion"
44
+
45
+ *Goal:* Demonstrate robustness and async processing.
46
+
47
+ **Action:**
48
+ 1. Submit a job with a mix of site types:
49
+ #+BEGIN_SRC bash
50
+ curl -X POST "http://127.0.0.1:8001/jobs" \
51
+ -H "Content-Type: application/json" \
52
+ -d '{ "urls": [
53
+ "https://www.harvard.edu",
54
+ "https://www.cheo.on.ca",
55
+ "https://www.greenpeace.org"
56
+ ]}'
57
+ #+END_SRC
58
+ *Copy the `job_id` returned.*
59
+
60
+ 2. Poll for results:
61
+ #+BEGIN_SRC bash
62
+ curl "http://127.0.0.1:8001/jobs/<YOUR_JOB_ID>" | jq .
63
+ #+END_SRC
64
+
65
+ **Talking Points:**
66
+ - ORGA handles concurrency automatically.
67
+ - It respects rate limits per host.
68
+ - It survives blocked pages without crashing the batch.
69
+
70
+ ** 4. Interpreting the "Black Box" (Debug Info)
71
+
72
+ In the JSON output, scroll down to `debug_info`.
73
+ - Show `classification_debug`.
74
+ - Explain how ORGA weighted the `/about` page more heavily than the `/news` page to determine the category.
75
+ - This proves the system is **deterministic**, not a "black box" AI.