folder-classifier 0.3.4__tar.gz → 0.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/PKG-INFO +5 -1
- folder_classifier-0.3.6/README.md +5 -0
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/folder_classifier/app.py +1 -1
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/folder_classifier/classifier.py +29 -31
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/folder_classifier.egg-info/PKG-INFO +5 -1
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/setup.cfg +1 -1
- folder_classifier-0.3.4/README.md +0 -1
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/folder_classifier/__init__.py +0 -0
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/folder_classifier/deploy.py +0 -0
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/folder_classifier/dto.py +0 -0
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/folder_classifier/util.py +0 -0
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/folder_classifier.egg-info/SOURCES.txt +0 -0
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/folder_classifier.egg-info/dependency_links.txt +0 -0
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/folder_classifier.egg-info/top_level.txt +0 -0
- {folder_classifier-0.3.4 → folder_classifier-0.3.6}/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: folder-classifier
|
3
|
-
Version: 0.3.4
|
3
|
+
Version: 0.3.6
|
4
4
|
Summary: Deploy folder classifier API to a Ray cluster
|
5
5
|
Author: Crispin Almodovar
|
6
6
|
Author-email:
|
@@ -11,3 +11,7 @@ Requires-Python: >=3.12
|
|
11
11
|
Description-Content-Type: text/markdown
|
12
12
|
|
13
13
|
# Folder classifier API
|
14
|
+
|
15
|
+
Implements a REST API that calls an LLM-based classifier model hosted in Ray Serve.
|
16
|
+
|
17
|
+
See [Build.md](Build.md) for build and deployment instructions.
|
@@ -17,7 +17,7 @@ class FolderClassifierAPI:
|
|
17
17
|
logging.basicConfig(level=logging.INFO)
|
18
18
|
self.logger = logging.getLogger(__name__)
|
19
19
|
self.classifier = FolderClassifier(app_name=model_config.app_name, deployment=model_config.deployment, model=model_config.model)
|
20
|
-
self.logger.info(f"Successfully initialized Folder Classifier using config: {model_config}")
|
20
|
+
self.logger.info(f"Successfully initialized Folder Classifier API using config: {model_config}")
|
21
21
|
|
22
22
|
@web_api.post("/predict")
|
23
23
|
async def predict(self, request: FolderClassificationRequest) -> FolderClassificationResponse:
|
@@ -1,28 +1,20 @@
|
|
1
1
|
import json
|
2
2
|
import logging
|
3
|
-
import os
|
4
3
|
from typing import Tuple, Dict, Any
|
5
4
|
|
6
5
|
from ray import serve
|
7
|
-
from ray.serve.handle import DeploymentHandle
|
8
6
|
|
9
7
|
from folder_classifier.dto import FolderClassificationRequest, FolderClassification
|
10
8
|
from folder_classifier.util import build_folder, render_tree
|
11
|
-
from openai import AsyncOpenAI
|
12
|
-
|
13
|
-
|
14
|
-
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://internal-multimodal-services.dev.cortoaws.com/serve/qwen3-4b-classifier/v1")
|
15
|
-
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "DUMMY_KEY")
|
16
|
-
MODEL = os.getenv("MODEL", "Qwen3-4B-Instruct-2507-classifier-FP8")
|
17
9
|
|
18
10
|
SYSTEM_PROMPT = """
|
19
11
|
You are a strict text classifier.
|
20
12
|
Output MUST be a single JSON object with exactly two keys: "category" and "reasoning".
|
21
13
|
- "category" ∈ {"matter","other"} (lowercase).
|
22
|
-
- "reasoning" ≤ 30 words.
|
14
|
+
- "reasoning" ≤ 30 words, citing rule IDs (e.g., RC1+R7 or R1+R3) and up to two brief evidence tokens.
|
23
15
|
- Do not include any double-quote (") characters inside the value of "reasoning"; use single quotes ' instead.
|
24
16
|
- No backticks, no code fences, no extra text. Return one JSON object only.
|
25
|
-
If unsure, choose "other".
|
17
|
+
No step-by-step thinking. No extra keys. If unsure, choose "other".
|
26
18
|
""".strip()
|
27
19
|
|
28
20
|
USER_PROMPT_TEMPLATE = """
|
@@ -36,21 +28,26 @@ Definitions
|
|
36
28
|
(c) legal-work indicators are absent, or
|
37
29
|
(d) there are zero files with extensions anywhere in the tree, or
|
38
30
|
(e) contents are exclusively non-legal domains (finance/accounting/IT/admin) with no legal-work indicators, or
|
39
|
-
(f) the ROOT NAME is
|
31
|
+
(f) the ROOT NAME is non-matter-specific, including either:
|
32
|
+
(f1) system/provenance roots (e.g., “CORTO Generated”, “Exports”, “Uploads”, “Dropbox Shared”, “OneDrive”, “SharePoint”, “Archive”, “Backup”, “Temp”, “Incoming”, “Outbox”, “Bulk Import”), or
|
33
|
+
(f2) informal/placeholder roots lacking client/company/matter cues (e.g., “hello”, “test”, “misc”, “docs”, “files”, “images”, “new folder”, “untitled”, very short single tokens with no digits).
|
40
34
|
|
41
35
|
Decision Rules (apply in order; case-insensitive)
|
42
|
-
RC_container: Any indication the FOLDER TREE holds more than one distinct matter
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
36
|
+
RC_container: Any indication the FOLDER TREE holds more than one distinct matter → category=other.
|
37
|
+
• Signals: ≥2 top-level subfolders with matter-like names (e.g., “<number> - <id>”, “<client> - <topic>”), even if those subfolders are empty; repetitive top-level matter-number patterns.
|
38
|
+
R0_subfolder: ROOT NAME equals a common matter subfolder/stage/type → category=other.
|
39
|
+
R7_files_present: Must contain ≥1 file with an extension (e.g., .pdf, .docx) anywhere; if none → category=other.
|
40
|
+
R8_nonlegal_only: Strong non-legal domain signals and NO legal-work indicators (R2–R4 or R6) → other.
|
41
|
+
• Finance/Accounting: BAS/IAS/GST/Tax Return, PAYG, payroll/timesheets/payslips, superannuation, Xero/MYOB/QuickBooks exports, GL/Trial Balance/Journals, bank statements, invoices/receipts, reconciliations, aged AR/AP, debtor/creditor lists, debtor collection schedules, AR “demand/reminder” letters in collections context.
|
42
|
+
• IT/Systems: backups, logs, source code, Git/DevOps, server/network/VPN, mailboxes, workspace/365 admin.
|
43
|
+
• Admin-only: generic receipts/vendor invoices/expenses without legal context.
|
44
|
+
R9_non_matter_specific_rootname: Non-matter-specific ROOT NAME → category=other.
|
45
|
+
• R9a_system_provenance: generated/export/import/shared/archive/backup/temp/uploads/downloads/platform names (Dropbox/OneDrive/SharePoint, etc.).
|
46
|
+
• R9b_placeholder_informal: informal/placeholder words with no client/company/matter cues (“hello”, “test”, “misc”, “docs”, “files”, “new folder”, “untitled”, short tokens without digits).
|
47
|
+
R10_legal_filename_semantics: If files exist but **no filename** in the tree is legal-sounding (and R2–R4/R6 are absent) → category=other.
|
48
|
+
• Legal-sounding filename cues (non-exhaustive): agreement/contract/deed/will/affidavit/statement/brief/advice/retainer/engagement/costs disclosure/pleadings/statement of claim/defence/court form/subpoena/orders/judgment/undertaking/notice/settlement/consent orders/particulars/summons/witness statement/statutory declaration.
|
49
|
+
• Generic/non-informative basenames (e.g., “test.pdf”, “doc1.pdf”, “file.pdf”, “notes.pdf”, “scan.pdf”, “image001.pdf”) **do not** count as legal-sounding.
|
50
|
+
R1_rootname: ROOT NAME resembles a single matter/container (matter/case/file number; client/surname/company; or a combination like “12345 Smith”, “Smith – Contract Dispute”, “Brown – Lease Review”).
|
54
51
|
R2_initial_docs: Early-stage matter docs (cost agreement/disclosure, retainer/engagement, intake/onboarding).
|
55
52
|
R3_legal_docs: Legal document types (agreement, contract, deed, will, affidavit, statement, advice, brief, pleadings, court forms, subpoena, orders, judgment, undertaking, notice of appeal, docket/case forms).
|
56
53
|
R4_legal_subfolders: Typical legal subfolders (correspondence/emails, file notes/attendance notes, searches/certificates, court documents/evidence/disclosure/discovery, drafts/final/signed/executed, billing/invoices/time records).
|
@@ -62,16 +59,17 @@ Decision
|
|
62
59
|
- Else if R0_subfolder → category=other (stop).
|
63
60
|
- Else if NOT R7_files_present → category=other (stop).
|
64
61
|
- Else if R8_nonlegal_only → category=other (stop).
|
65
|
-
- Else if
|
66
|
-
- Else if
|
67
|
-
|
62
|
+
- Else if R9_non_matter_specific_rootname → category=other (stop).
|
63
|
+
- Else if R10_legal_filename_semantics → category=other (stop).
|
64
|
+
- Else if R1_rootname AND any of {R2, R3, R4, R6} AND no multi-matter signal → category=matter.
|
65
|
+
(R5 is supportive only and cannot justify "matter" by itself.)
|
68
66
|
- Else → category=other.
|
69
67
|
|
70
68
|
Normalization
|
71
|
-
- “File with extension”:
|
72
|
-
- Treat hyphens/underscores as separators. Ignore
|
69
|
+
- “File with extension”: period + 1–5 char alphanumeric extension (e.g., .pdf, .docx). Ignore leading/trailing periods.
|
70
|
+
- Treat hyphens/underscores as separators. Ignore extensions for semantic matching beyond presence. Tolerate minor typos.
|
73
71
|
|
74
|
-
Output
|
72
|
+
Output (JSON only; no prose before/after):
|
75
73
|
{"category": "<matter|other>", "reasoning": "≤30 words citing R# and 1–2 evidence tokens>"}
|
76
74
|
|
77
75
|
FOLDER TREE:
|
@@ -87,7 +85,7 @@ class FolderClassifier:
|
|
87
85
|
self.logger = logging.getLogger(__name__)
|
88
86
|
self.model_handle = serve.get_deployment_handle(app_name=app_name, deployment_name=deployment)
|
89
87
|
self.model = model
|
90
|
-
self.logger.info(f"Successfully initialized
|
88
|
+
self.logger.info(f"Successfully initialized Folder Classifier with remote Ray model: {self.model}")
|
91
89
|
|
92
90
|
async def predict(self, request: FolderClassificationRequest) -> Tuple[str, str]:
|
93
91
|
content = ""
|
@@ -98,7 +96,7 @@ class FolderClassifier:
|
|
98
96
|
content = response_dict["choices"][0]["message"]["content"]
|
99
97
|
result = FolderClassification.model_validate_json(content)
|
100
98
|
except Exception as ex:
|
101
|
-
self.logger.
|
99
|
+
self.logger.warning(f"Failed to parse response: {content}\n{ex}")
|
102
100
|
if '"category": "matter"' in content:
|
103
101
|
result = FolderClassification(category="matter", reasoning="NA")
|
104
102
|
else:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: folder-classifier
|
3
|
-
Version: 0.3.4
|
3
|
+
Version: 0.3.6
|
4
4
|
Summary: Deploy folder classifier API to a Ray cluster
|
5
5
|
Author: Crispin Almodovar
|
6
6
|
Author-email:
|
@@ -11,3 +11,7 @@ Requires-Python: >=3.12
|
|
11
11
|
Description-Content-Type: text/markdown
|
12
12
|
|
13
13
|
# Folder classifier API
|
14
|
+
|
15
|
+
Implements a REST API that calls an LLM-based classifier model hosted in Ray Serve.
|
16
|
+
|
17
|
+
See [Build.md](Build.md) for build and deployment instructions.
|
@@ -1 +0,0 @@
|
|
1
|
-
# Folder classifier API
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{folder_classifier-0.3.4 → folder_classifier-0.3.6}/folder_classifier.egg-info/dependency_links.txt
RENAMED
File without changes
|
{folder_classifier-0.3.4 → folder_classifier-0.3.6}/folder_classifier.egg-info/top_level.txt
RENAMED
File without changes
|
File without changes
|