app-classifier 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app_classifier-0.1.0/LICENSE +21 -0
- app_classifier-0.1.0/PKG-INFO +250 -0
- app_classifier-0.1.0/README.md +218 -0
- app_classifier-0.1.0/pyproject.toml +68 -0
- app_classifier-0.1.0/setup.cfg +4 -0
- app_classifier-0.1.0/src/app_classifier/__init__.py +66 -0
- app_classifier-0.1.0/src/app_classifier/agent.py +662 -0
- app_classifier-0.1.0/src/app_classifier/classifier.py +802 -0
- app_classifier-0.1.0/src/app_classifier/cli.py +93 -0
- app_classifier-0.1.0/src/app_classifier/data/web_server_cves.json +75 -0
- app_classifier-0.1.0/src/app_classifier/hosting.py +664 -0
- app_classifier-0.1.0/src/app_classifier/web_server_vulns.py +151 -0
- app_classifier-0.1.0/src/app_classifier.egg-info/PKG-INFO +250 -0
- app_classifier-0.1.0/src/app_classifier.egg-info/SOURCES.txt +18 -0
- app_classifier-0.1.0/src/app_classifier.egg-info/dependency_links.txt +1 -0
- app_classifier-0.1.0/src/app_classifier.egg-info/entry_points.txt +2 -0
- app_classifier-0.1.0/src/app_classifier.egg-info/requires.txt +8 -0
- app_classifier-0.1.0/src/app_classifier.egg-info/top_level.txt +1 -0
- app_classifier-0.1.0/tests/test_agent.py +170 -0
- app_classifier-0.1.0/tests/test_classifier.py +190 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Codefixer contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: app-classifier
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Point it at a repo, get back 'this is an e-commerce app that does X' — pattern-based application functional-category inference from routes, data models, and README.
|
|
5
|
+
Author: Codefixer contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/codefixer/app-classifier
|
|
8
|
+
Project-URL: Issues, https://github.com/codefixer/app-classifier/issues
|
|
9
|
+
Project-URL: Source, https://github.com/codefixer/app-classifier
|
|
10
|
+
Keywords: code-analysis,static-analysis,repository-classifier,code-understanding,documentation,hosting-requirements,devops,onboarding
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Software Development
|
|
18
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
19
|
+
Classifier: Topic :: System :: Systems Administration
|
|
20
|
+
Classifier: Intended Audience :: Developers
|
|
21
|
+
Classifier: Intended Audience :: System Administrators
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Provides-Extra: test
|
|
26
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
27
|
+
Requires-Dist: pytest-cov>=4.0; extra == "test"
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
30
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# app-classifier
|
|
34
|
+
|
|
35
|
+
> Point it at a repo, get back **"this is an e-commerce app that does X"**.
|
|
36
|
+
|
|
37
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
38
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
39
|
+
[]()
|
|
40
|
+
|
|
41
|
+
Pattern-based application functional-category inference from routes, data models, and README. **Zero runtime dependencies** (pure stdlib). Optional LLM polish — bring your own provider.
|
|
42
|
+
|
|
43
|
+
## What problem this solves
|
|
44
|
+
|
|
45
|
+
Onboarding to a new repo, every engineer asks the same questions: *"What does this thing do? Is it a CRUD app or a queue worker? What database? What ports does it need?"* The README is usually wrong or stale. Codeowners are unavailable. You end up grepping for clues.
|
|
46
|
+
|
|
47
|
+
`app-classifier` answers those questions in **under a second** for any repo, on disk, with no network calls:
|
|
48
|
+
|
|
49
|
+
- **What kind of app is this?** — e-commerce, blog, social network, admin panel, REST API, auth/SSO, file management, scheduling, or messaging (9 categories, weighted-pattern matching, confidence-scored)
|
|
50
|
+
- **What does it do?** — a 2-3 sentence functional description, deterministically composed, optionally LLM-polished
|
|
51
|
+
- **How does it deploy?** — runtime + version, framework, web server, databases, caches, ports, env vars, container base image, runtime CVEs
|
|
52
|
+
|
|
53
|
+
## Quick start
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install app-classifier
|
|
57
|
+
app-classifier ./my-repo
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
=== my-repo ===
|
|
62
|
+
|
|
63
|
+
Category: e-commerce (78% confidence)
|
|
64
|
+
Runtime: python 3.11
|
|
65
|
+
Framework: FastAPI
|
|
66
|
+
Deploys as: ASGI server (uvicorn / hypercorn / daphne)
|
|
67
|
+
Databases: PostgreSQL, SQLAlchemy ORM
|
|
68
|
+
Cache/Queue: Redis, Celery
|
|
69
|
+
Features: online shopping, messaging
|
|
70
|
+
|
|
71
|
+
📋 Summary: my-repo · python 3.11 · FastAPI · 23 HTTP route(s) · 5 data model(s) · DB: PostgreSQL, SQLAlchemy ORM
|
|
72
|
+
|
|
73
|
+
📝 What it does:
|
|
74
|
+
my-repo is an e-commerce application. Primary functionality: online shopping, messaging.
|
|
75
|
+
It models entities like Cart, Order, Product, User serving authenticated users.
|
|
76
|
+
|
|
77
|
+
🌐 HTTP Routes (23 found):
|
|
78
|
+
GET /products → list_products
|
|
79
|
+
POST /cart/add → add_to_cart
|
|
80
|
+
POST /checkout → checkout
|
|
81
|
+
...
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Python API
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from app_classifier import classify
|
|
88
|
+
|
|
89
|
+
result = classify("./my-repo")
|
|
90
|
+
|
|
91
|
+
print(result.app_category) # 'e-commerce'
|
|
92
|
+
print(result.app_category_confidence) # 0.78
|
|
93
|
+
print(result.detected_features) # ['online shopping', 'messaging']
|
|
94
|
+
print(result.functional_description) # "my-repo is an e-commerce application. ..."
|
|
95
|
+
|
|
96
|
+
# Full structured access
|
|
97
|
+
for route in result.routes:
|
|
98
|
+
print(route.method, route.path, route.handler)
|
|
99
|
+
|
|
100
|
+
for model in result.data_models:
|
|
101
|
+
print(model.name, model.framework, model.fields_hint)
|
|
102
|
+
|
|
103
|
+
# JSON-serializable
|
|
104
|
+
import json
|
|
105
|
+
print(json.dumps(result.to_dict(), indent=2))
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Just the deployment data?
|
|
109
|
+
|
|
110
|
+
Skip the classifier, use `hosting` directly:
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from app_classifier import analyze_hosting_requirements
|
|
114
|
+
|
|
115
|
+
report = analyze_hosting_requirements("./my-repo")
|
|
116
|
+
print(report.runtime) # {'language': 'python', 'version': '3.11'}
|
|
117
|
+
print(report.web_server) # {'framework': 'FastAPI', 'deployment_target': '...'}
|
|
118
|
+
print(report.databases) # [{'name': 'PostgreSQL', ...}, ...]
|
|
119
|
+
print(report.ports) # [{'port': 8000, 'source': 'Dockerfile', ...}]
|
|
120
|
+
print(report.web_server_vulnerabilities) # CVEs on the container base image
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Optional: LLM polish
|
|
124
|
+
|
|
125
|
+
`classify_async` accepts ANY async callable as the LLM provider — no SDK pinned. If the LLM gives a useful response, the deterministic `functional_description` is replaced with the polished version; on any failure (timeout / parse error / hallucination guard / no provider) the deterministic version is kept.
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
# OpenAI shim
|
|
129
|
+
async def my_openai_provider(prompt, max_tokens=400, temperature=0.2):
|
|
130
|
+
import openai
|
|
131
|
+
client = openai.AsyncOpenAI()
|
|
132
|
+
resp = await client.chat.completions.create(
|
|
133
|
+
model="gpt-4o-mini",
|
|
134
|
+
messages=[{"role": "user", "content": prompt}],
|
|
135
|
+
max_tokens=max_tokens, temperature=temperature,
|
|
136
|
+
)
|
|
137
|
+
return resp.choices[0].message.content
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# Anthropic shim
|
|
141
|
+
async def my_anthropic_provider(prompt, max_tokens=400, temperature=0.2):
|
|
142
|
+
import anthropic
|
|
143
|
+
client = anthropic.AsyncAnthropic()
|
|
144
|
+
resp = await client.messages.create(
|
|
145
|
+
model="claude-haiku-4-5",
|
|
146
|
+
max_tokens=max_tokens, temperature=temperature,
|
|
147
|
+
messages=[{"role": "user", "content": prompt}],
|
|
148
|
+
)
|
|
149
|
+
return resp.content[0].text
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# Local llama.cpp / Ollama shim
|
|
153
|
+
async def my_ollama_provider(prompt, max_tokens=400, temperature=0.2):
|
|
154
|
+
import httpx
|
|
155
|
+
async with httpx.AsyncClient() as client:
|
|
156
|
+
r = await client.post("http://localhost:11434/api/generate", json={
|
|
157
|
+
"model": "llama3", "prompt": prompt, "stream": False,
|
|
158
|
+
"options": {"num_predict": max_tokens, "temperature": temperature},
|
|
159
|
+
})
|
|
160
|
+
return r.json().get("response")
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# Use any of the above
|
|
164
|
+
import asyncio
|
|
165
|
+
from app_classifier import classify_async
|
|
166
|
+
result = asyncio.run(classify_async("./my-repo", llm_provider=my_openai_provider))
|
|
167
|
+
print(result.functional_description)
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## What detection is supported
|
|
171
|
+
|
|
172
|
+
### Runtimes
|
|
173
|
+
Python, Java (JDK 8+), Node.js, Go, Ruby, PHP, Rust — detected from manifest files, Dockerfiles, version files (`.nvmrc`, `.python-version`, `.ruby-version`).
|
|
174
|
+
|
|
175
|
+
### Web frameworks (route extraction)
|
|
176
|
+
| Language | Frameworks |
|
|
177
|
+
|---|---|
|
|
178
|
+
| Python | Flask, FastAPI, Django |
|
|
179
|
+
| Java | Spring Boot, Struts 2 (struts.xml), classic Spring |
|
|
180
|
+
| Node | Express, Fastify, NestJS, Next.js |
|
|
181
|
+
|
|
182
|
+
### Data model ORMs
|
|
183
|
+
| ORM | Detected from |
|
|
184
|
+
|---|---|
|
|
185
|
+
| JPA / Hibernate | `@Entity`, `@Table` annotations |
|
|
186
|
+
| SQLAlchemy | `class X(Base)` |
|
|
187
|
+
| Django ORM | `class X(models.Model)` |
|
|
188
|
+
|
|
189
|
+
### Databases / caches
|
|
190
|
+
PostgreSQL, MySQL, MongoDB, H2, Oracle, SQL Server, MariaDB, Redis, RabbitMQ, Kafka, Elasticsearch, Celery.
|
|
191
|
+
|
|
192
|
+
### Container/deployment
|
|
193
|
+
Dockerfile (`FROM`, `EXPOSE`, `ENV`), docker-compose, Kubernetes manifests, Helm charts, k8s deployment YAML, Heroku Procfile, Vercel / Netlify configs.
|
|
194
|
+
|
|
195
|
+
### Runtime CVEs (web-server vulnerabilities)
|
|
196
|
+
Curated CVE manifest for nginx, Apache HTTPD, Tomcat, OpenJDK / Eclipse Temurin / Amazon Corretto. ~30 high-impact CVEs covered out of the box. PRs welcome.
|
|
197
|
+
|
|
198
|
+
### App categories (functional fingerprints)
|
|
199
|
+
e-commerce, blog/content, social network, admin panel/dashboard, REST API service, authentication/SSO, file/document management, scheduling/booking, messaging/notification. Each is matched by a weighted regex pattern against routes + model names + README.
|
|
200
|
+
|
|
201
|
+
## How it works
|
|
202
|
+
|
|
203
|
+
1. Walk every manifest/config file in the repo (capped at 800 files for speed)
|
|
204
|
+
2. Extract language-specific signals from each file (Maven artifact IDs, npm package names, Python deps, Dockerfile FROM, k8s containerPort, etc.) → `HostingReport`
|
|
205
|
+
3. Walk source files to extract HTTP routes + data models per framework
|
|
206
|
+
4. Pattern-match routes + model names + README purpose against 9 category fingerprints (weighted regex)
|
|
207
|
+
5. Compose the 2-3 sentence functional description deterministically
|
|
208
|
+
6. (Optional) Hand the structured signals to your LLM for a polished rewrite
|
|
209
|
+
|
|
210
|
+
**Time budget:** under 1 second on a 5K-file repo. Bounded scan caps file count + per-file read size.
|
|
211
|
+
|
|
212
|
+
## Design principles
|
|
213
|
+
|
|
214
|
+
- **No network**. Every signal comes from on-disk content. Bundled CVE manifest, no live API calls.
|
|
215
|
+
- **No SDK pin**. The LLM step is provider-agnostic — bring your own callable. We never `import openai`.
|
|
216
|
+
- **No surprises**. Failures on individual files don't kill the pass. Confidence is always reported; the consumer decides whether to trust it.
|
|
217
|
+
- **Pure read**. We never modify the target repo.
|
|
218
|
+
|
|
219
|
+
## Contributing
|
|
220
|
+
|
|
221
|
+
PRs welcome on three axes:
|
|
222
|
+
|
|
223
|
+
1. **More category fingerprints** — `_CATEGORY_FINGERPRINTS` in `classifier.py`. Each is `{ name, feature_label, signals: [(regex, weight), ...] }`.
|
|
224
|
+
2. **More CVE entries** — `data/web_server_cves.json`. Schema is documented in the file header.
|
|
225
|
+
3. **More framework extractors** — route + model extraction for Ruby on Rails, Phoenix, ASP.NET Core, Gin, Rocket, etc. would all be welcome.
|
|
226
|
+
|
|
227
|
+
### Run the test suite
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
pip install -e ".[test]"
|
|
231
|
+
pytest
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
Tests use fixture directories under `tests/fixtures/` — point the classifier at each, assert the expected category + features.
|
|
235
|
+
|
|
236
|
+
## What this is NOT
|
|
237
|
+
|
|
238
|
+
- **Not a security scanner.** It surfaces runtime CVEs on the container base image, but the rest of the code is for understanding, not vulnerability detection.
|
|
239
|
+
- **Not a deployment tool.** It tells you what the deployment looks like; it doesn't deploy anything.
|
|
240
|
+
- **Not a replacement for a README.** It generates a structural sketch; humans still write the narrative.
|
|
241
|
+
|
|
242
|
+
If you want a full security analysis + fix pipeline that uses this internally, see [Codefixer](https://codefixer.ai) (closed-source).
|
|
243
|
+
|
|
244
|
+
## License
|
|
245
|
+
|
|
246
|
+
MIT — see [LICENSE](LICENSE). Use it however you want. Attribution appreciated but not required.
|
|
247
|
+
|
|
248
|
+
## Acknowledgements
|
|
249
|
+
|
|
250
|
+
Extracted from Codefixer's `hosting_requirements` + `app_description` analyzers. The category-fingerprint approach was inspired by Sourcegraph's "what is this repo?" tooling and the way Backstage classifies services.
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# app-classifier
|
|
2
|
+
|
|
3
|
+
> Point it at a repo, get back **"this is an e-commerce app that does X"**.
|
|
4
|
+
|
|
5
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
7
|
+
[]()
|
|
8
|
+
|
|
9
|
+
Pattern-based application functional-category inference from routes, data models, and README. **Zero runtime dependencies** (pure stdlib). Optional LLM polish — bring your own provider.
|
|
10
|
+
|
|
11
|
+
## What problem this solves
|
|
12
|
+
|
|
13
|
+
Onboarding to a new repo, every engineer asks the same questions: *"What does this thing do? Is it a CRUD app or a queue worker? What database? What ports does it need?"* The README is usually wrong or stale. Codeowners are unavailable. You end up grepping for clues.
|
|
14
|
+
|
|
15
|
+
`app-classifier` answers those questions in **under a second** for any repo, on disk, with no network calls:
|
|
16
|
+
|
|
17
|
+
- **What kind of app is this?** — e-commerce, blog, social network, admin panel, REST API, auth/SSO, file management, scheduling, or messaging (9 categories, weighted-pattern matching, confidence-scored)
|
|
18
|
+
- **What does it do?** — a 2-3 sentence functional description, deterministically composed, optionally LLM-polished
|
|
19
|
+
- **How does it deploy?** — runtime + version, framework, web server, databases, caches, ports, env vars, container base image, runtime CVEs
|
|
20
|
+
|
|
21
|
+
## Quick start
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install app-classifier
|
|
25
|
+
app-classifier ./my-repo
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
=== my-repo ===
|
|
30
|
+
|
|
31
|
+
Category: e-commerce (78% confidence)
|
|
32
|
+
Runtime: python 3.11
|
|
33
|
+
Framework: FastAPI
|
|
34
|
+
Deploys as: ASGI server (uvicorn / hypercorn / daphne)
|
|
35
|
+
Databases: PostgreSQL, SQLAlchemy ORM
|
|
36
|
+
Cache/Queue: Redis, Celery
|
|
37
|
+
Features: online shopping, messaging
|
|
38
|
+
|
|
39
|
+
📋 Summary: my-repo · python 3.11 · FastAPI · 23 HTTP route(s) · 5 data model(s) · DB: PostgreSQL, SQLAlchemy ORM
|
|
40
|
+
|
|
41
|
+
📝 What it does:
|
|
42
|
+
my-repo is an e-commerce application. Primary functionality: online shopping, messaging.
|
|
43
|
+
It models entities like Cart, Order, Product, User serving authenticated users.
|
|
44
|
+
|
|
45
|
+
🌐 HTTP Routes (23 found):
|
|
46
|
+
GET /products → list_products
|
|
47
|
+
POST /cart/add → add_to_cart
|
|
48
|
+
POST /checkout → checkout
|
|
49
|
+
...
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Python API
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from app_classifier import classify
|
|
56
|
+
|
|
57
|
+
result = classify("./my-repo")
|
|
58
|
+
|
|
59
|
+
print(result.app_category) # 'e-commerce'
|
|
60
|
+
print(result.app_category_confidence) # 0.78
|
|
61
|
+
print(result.detected_features) # ['online shopping', 'messaging']
|
|
62
|
+
print(result.functional_description) # "my-repo is an e-commerce application. ..."
|
|
63
|
+
|
|
64
|
+
# Full structured access
|
|
65
|
+
for route in result.routes:
|
|
66
|
+
print(route.method, route.path, route.handler)
|
|
67
|
+
|
|
68
|
+
for model in result.data_models:
|
|
69
|
+
print(model.name, model.framework, model.fields_hint)
|
|
70
|
+
|
|
71
|
+
# JSON-serializable
|
|
72
|
+
import json
|
|
73
|
+
print(json.dumps(result.to_dict(), indent=2))
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Just the deployment data?
|
|
77
|
+
|
|
78
|
+
Skip the classifier, use `hosting` directly:
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from app_classifier import analyze_hosting_requirements
|
|
82
|
+
|
|
83
|
+
report = analyze_hosting_requirements("./my-repo")
|
|
84
|
+
print(report.runtime) # {'language': 'python', 'version': '3.11'}
|
|
85
|
+
print(report.web_server) # {'framework': 'FastAPI', 'deployment_target': '...'}
|
|
86
|
+
print(report.databases) # [{'name': 'PostgreSQL', ...}, ...]
|
|
87
|
+
print(report.ports) # [{'port': 8000, 'source': 'Dockerfile', ...}]
|
|
88
|
+
print(report.web_server_vulnerabilities) # CVEs on the container base image
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Optional: LLM polish
|
|
92
|
+
|
|
93
|
+
`classify_async` accepts ANY async callable as the LLM provider — no SDK pinned. If the LLM gives a useful response, the deterministic `functional_description` is replaced with the polished version; on any failure (timeout / parse error / hallucination guard / no provider) the deterministic version is kept.
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
# OpenAI shim
|
|
97
|
+
async def my_openai_provider(prompt, max_tokens=400, temperature=0.2):
|
|
98
|
+
import openai
|
|
99
|
+
client = openai.AsyncOpenAI()
|
|
100
|
+
resp = await client.chat.completions.create(
|
|
101
|
+
model="gpt-4o-mini",
|
|
102
|
+
messages=[{"role": "user", "content": prompt}],
|
|
103
|
+
max_tokens=max_tokens, temperature=temperature,
|
|
104
|
+
)
|
|
105
|
+
return resp.choices[0].message.content
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# Anthropic shim
|
|
109
|
+
async def my_anthropic_provider(prompt, max_tokens=400, temperature=0.2):
|
|
110
|
+
import anthropic
|
|
111
|
+
client = anthropic.AsyncAnthropic()
|
|
112
|
+
resp = await client.messages.create(
|
|
113
|
+
model="claude-haiku-4-5",
|
|
114
|
+
max_tokens=max_tokens, temperature=temperature,
|
|
115
|
+
messages=[{"role": "user", "content": prompt}],
|
|
116
|
+
)
|
|
117
|
+
return resp.content[0].text
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# Local llama.cpp / Ollama shim
|
|
121
|
+
async def my_ollama_provider(prompt, max_tokens=400, temperature=0.2):
|
|
122
|
+
import httpx
|
|
123
|
+
async with httpx.AsyncClient() as client:
|
|
124
|
+
r = await client.post("http://localhost:11434/api/generate", json={
|
|
125
|
+
"model": "llama3", "prompt": prompt, "stream": False,
|
|
126
|
+
"options": {"num_predict": max_tokens, "temperature": temperature},
|
|
127
|
+
})
|
|
128
|
+
return r.json().get("response")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# Use any of the above
|
|
132
|
+
import asyncio
|
|
133
|
+
from app_classifier import classify_async
|
|
134
|
+
result = asyncio.run(classify_async("./my-repo", llm_provider=my_openai_provider))
|
|
135
|
+
print(result.functional_description)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## What detection is supported
|
|
139
|
+
|
|
140
|
+
### Runtimes
|
|
141
|
+
Python, Java (JDK 8+), Node.js, Go, Ruby, PHP, Rust — detected from manifest files, Dockerfiles, version files (`.nvmrc`, `.python-version`, `.ruby-version`).
|
|
142
|
+
|
|
143
|
+
### Web frameworks (route extraction)
|
|
144
|
+
| Language | Frameworks |
|
|
145
|
+
|---|---|
|
|
146
|
+
| Python | Flask, FastAPI, Django |
|
|
147
|
+
| Java | Spring Boot, Struts 2 (struts.xml), classic Spring |
|
|
148
|
+
| Node | Express, Fastify, NestJS, Next.js |
|
|
149
|
+
|
|
150
|
+
### Data model ORMs
|
|
151
|
+
| ORM | Detected from |
|
|
152
|
+
|---|---|
|
|
153
|
+
| JPA / Hibernate | `@Entity`, `@Table` annotations |
|
|
154
|
+
| SQLAlchemy | `class X(Base)` |
|
|
155
|
+
| Django ORM | `class X(models.Model)` |
|
|
156
|
+
|
|
157
|
+
### Databases / caches
|
|
158
|
+
PostgreSQL, MySQL, MongoDB, H2, Oracle, SQL Server, MariaDB, Redis, RabbitMQ, Kafka, Elasticsearch, Celery.
|
|
159
|
+
|
|
160
|
+
### Container/deployment
|
|
161
|
+
Dockerfile (`FROM`, `EXPOSE`, `ENV`), docker-compose, Kubernetes manifests, Helm charts, k8s deployment YAML, Heroku Procfile, Vercel / Netlify configs.
|
|
162
|
+
|
|
163
|
+
### Runtime CVEs (web-server vulnerabilities)
|
|
164
|
+
Curated CVE manifest for nginx, Apache HTTPD, Tomcat, OpenJDK / Eclipse Temurin / Amazon Corretto. ~30 high-impact CVEs covered out of the box. PRs welcome.
|
|
165
|
+
|
|
166
|
+
### App categories (functional fingerprints)
|
|
167
|
+
e-commerce, blog/content, social network, admin panel/dashboard, REST API service, authentication/SSO, file/document management, scheduling/booking, messaging/notification. Each is matched by a weighted regex pattern against routes + model names + README.
|
|
168
|
+
|
|
169
|
+
## How it works
|
|
170
|
+
|
|
171
|
+
1. Walk every manifest/config file in the repo (capped at 800 files for speed)
|
|
172
|
+
2. Extract language-specific signals from each file (Maven artifact IDs, npm package names, Python deps, Dockerfile FROM, k8s containerPort, etc.) → `HostingReport`
|
|
173
|
+
3. Walk source files to extract HTTP routes + data models per framework
|
|
174
|
+
4. Pattern-match routes + model names + README purpose against 9 category fingerprints (weighted regex)
|
|
175
|
+
5. Compose the 2-3 sentence functional description deterministically
|
|
176
|
+
6. (Optional) Hand the structured signals to your LLM for a polished rewrite
|
|
177
|
+
|
|
178
|
+
**Time budget:** under 1 second on a 5K-file repo. Bounded scan caps file count + per-file read size.
|
|
179
|
+
|
|
180
|
+
## Design principles
|
|
181
|
+
|
|
182
|
+
- **No network**. Every signal comes from on-disk content. Bundled CVE manifest, no live API calls.
|
|
183
|
+
- **No SDK pin**. The LLM step is provider-agnostic — bring your own callable. We never `import openai`.
|
|
184
|
+
- **No surprises**. Failures on individual files don't kill the pass. Confidence is always reported; the consumer decides whether to trust it.
|
|
185
|
+
- **Pure read**. We never modify the target repo.
|
|
186
|
+
|
|
187
|
+
## Contributing
|
|
188
|
+
|
|
189
|
+
PRs welcome on three axes:
|
|
190
|
+
|
|
191
|
+
1. **More category fingerprints** — `_CATEGORY_FINGERPRINTS` in `classifier.py`. Each is `{ name, feature_label, signals: [(regex, weight), ...] }`.
|
|
192
|
+
2. **More CVE entries** — `data/web_server_cves.json`. Schema is documented in the file header.
|
|
193
|
+
3. **More framework extractors** — route + model extraction for Ruby on Rails, Phoenix, ASP.NET Core, Gin, Rocket, etc. would all be welcome.
|
|
194
|
+
|
|
195
|
+
### Run the test suite
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
pip install -e ".[test]"
|
|
199
|
+
pytest
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
Tests use fixture directories under `tests/fixtures/` — point the classifier at each, assert the expected category + features.
|
|
203
|
+
|
|
204
|
+
## What this is NOT
|
|
205
|
+
|
|
206
|
+
- **Not a security scanner.** It surfaces runtime CVEs on the container base image, but the rest of the code is for understanding, not vulnerability detection.
|
|
207
|
+
- **Not a deployment tool.** It tells you what the deployment looks like; it doesn't deploy anything.
|
|
208
|
+
- **Not a replacement for a README.** It generates a structural sketch; humans still write the narrative.
|
|
209
|
+
|
|
210
|
+
If you want a full security analysis + fix pipeline that uses this internally, see [Codefixer](https://codefixer.ai) (closed-source).
|
|
211
|
+
|
|
212
|
+
## License
|
|
213
|
+
|
|
214
|
+
MIT — see [LICENSE](LICENSE). Use it however you want. Attribution appreciated but not required.
|
|
215
|
+
|
|
216
|
+
## Acknowledgements
|
|
217
|
+
|
|
218
|
+
Extracted from Codefixer's `hosting_requirements` + `app_description` analyzers. The category-fingerprint approach was inspired by Sourcegraph's "what is this repo?" tooling and the way Backstage classifies services.
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "app-classifier"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Point it at a repo, get back 'this is an e-commerce app that does X' — pattern-based application functional-category inference from routes, data models, and README."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Codefixer contributors" },
|
|
13
|
+
]
|
|
14
|
+
keywords = [
|
|
15
|
+
"code-analysis", "static-analysis", "repository-classifier",
|
|
16
|
+
"code-understanding", "documentation", "hosting-requirements",
|
|
17
|
+
"devops", "onboarding",
|
|
18
|
+
]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Development Status :: 4 - Beta",
|
|
21
|
+
"License :: OSI Approved :: MIT License",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Topic :: Software Development",
|
|
27
|
+
"Topic :: Software Development :: Documentation",
|
|
28
|
+
"Topic :: System :: Systems Administration",
|
|
29
|
+
"Intended Audience :: Developers",
|
|
30
|
+
"Intended Audience :: System Administrators",
|
|
31
|
+
]
|
|
32
|
+
requires-python = ">=3.10"
|
|
33
|
+
# Pure stdlib. The optional LLM enrichment is provider-agnostic — users
|
|
34
|
+
# pass their own callable, so we don't pin OpenAI/Anthropic SDKs here.
|
|
35
|
+
dependencies = []
|
|
36
|
+
|
|
37
|
+
[project.optional-dependencies]
|
|
38
|
+
test = [
|
|
39
|
+
"pytest>=7.0",
|
|
40
|
+
"pytest-cov>=4.0",
|
|
41
|
+
]
|
|
42
|
+
dev = [
|
|
43
|
+
"ruff>=0.1.0",
|
|
44
|
+
"mypy>=1.0",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
[project.urls]
|
|
48
|
+
Homepage = "https://github.com/codefixer/app-classifier"
|
|
49
|
+
Issues = "https://github.com/codefixer/app-classifier/issues"
|
|
50
|
+
Source = "https://github.com/codefixer/app-classifier"
|
|
51
|
+
|
|
52
|
+
[project.scripts]
|
|
53
|
+
app-classifier = "app_classifier.cli:main"
|
|
54
|
+
|
|
55
|
+
[tool.setuptools.packages.find]
|
|
56
|
+
where = ["src"]
|
|
57
|
+
|
|
58
|
+
[tool.setuptools.package-data]
|
|
59
|
+
app_classifier = ["data/*.json"]
|
|
60
|
+
|
|
61
|
+
[tool.pytest.ini_options]
|
|
62
|
+
testpaths = ["tests"]
|
|
63
|
+
python_files = ["test_*.py"]
|
|
64
|
+
addopts = "-q --tb=short"
|
|
65
|
+
|
|
66
|
+
[tool.ruff]
|
|
67
|
+
line-length = 100
|
|
68
|
+
target-version = "py310"
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""app-classifier — pattern-based application functional-category inference.
|
|
2
|
+
|
|
3
|
+
Point it at a repo on disk, get back:
|
|
4
|
+
- what kind of app it is (e-commerce, blog, admin panel, REST API, etc.)
|
|
5
|
+
- a 2-3 sentence functional description
|
|
6
|
+
- hosting requirements (runtime, web server, databases, ports, env vars)
|
|
7
|
+
- detected HTTP routes + data models
|
|
8
|
+
- optional LLM-refined description (provider-agnostic — bring your own)
|
|
9
|
+
|
|
10
|
+
Quick start:
|
|
11
|
+
>>> from app_classifier import classify
|
|
12
|
+
>>> result = classify("./my-repo")
|
|
13
|
+
>>> print(result.summary)
|
|
14
|
+
my-repo · python 3.11 · FastAPI · 23 HTTP route(s) · 5 data model(s) · DB: PostgreSQL
|
|
15
|
+
>>> print(result.app_category, result.app_category_confidence)
|
|
16
|
+
e-commerce 0.78
|
|
17
|
+
>>> print(result.functional_description)
|
|
18
|
+
my-repo is an e-commerce application. Primary functionality: online shopping. ...
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from app_classifier.classifier import (
|
|
22
|
+
AppDescription,
|
|
23
|
+
DataModel,
|
|
24
|
+
RouteEntry,
|
|
25
|
+
classify,
|
|
26
|
+
classify_async,
|
|
27
|
+
describe_app, # alias for classify
|
|
28
|
+
llm_enrich_description,
|
|
29
|
+
)
|
|
30
|
+
from app_classifier.hosting import (
|
|
31
|
+
HostingReport,
|
|
32
|
+
Signal,
|
|
33
|
+
analyze_hosting_requirements,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
__version__ = "0.1.0"
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
# Primary API
|
|
40
|
+
"classify",
|
|
41
|
+
"classify_async",
|
|
42
|
+
"describe_app",
|
|
43
|
+
"AppDescription",
|
|
44
|
+
"RouteEntry",
|
|
45
|
+
"DataModel",
|
|
46
|
+
# Hosting subsystem (re-exported for users who only want deployment data)
|
|
47
|
+
"analyze_hosting_requirements",
|
|
48
|
+
"HostingReport",
|
|
49
|
+
"Signal",
|
|
50
|
+
# LLM enrichment hook
|
|
51
|
+
"llm_enrich_description",
|
|
52
|
+
# Agentic mode — opt-in, requires an LLM provider
|
|
53
|
+
"classify_agentic",
|
|
54
|
+
"AgentClassificationResult",
|
|
55
|
+
"AgentStep",
|
|
56
|
+
"SubappClassification",
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
# Agentic API (eager import for now; a lazy one would keep `import app_classifier` cheap for
|
|
60
|
+
# users who only need the deterministic path)
|
|
61
|
+
from app_classifier.agent import (
|
|
62
|
+
AgentClassificationResult,
|
|
63
|
+
AgentStep,
|
|
64
|
+
SubappClassification,
|
|
65
|
+
classify_agentic,
|
|
66
|
+
)
|