folder-classifier 0.2.3__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- folder_classifier/app.py +7 -10
- folder_classifier/classifier.py +127 -23
- folder_classifier/dto.py +12 -29
- folder_classifier/util.py +77 -3
- {folder_classifier-0.2.3.dist-info → folder_classifier-0.3.1.dist-info}/METADATA +1 -1
- folder_classifier-0.3.1.dist-info/RECORD +10 -0
- folder_classifier-0.2.3.dist-info/RECORD +0 -10
- {folder_classifier-0.2.3.dist-info → folder_classifier-0.3.1.dist-info}/WHEEL +0 -0
- {folder_classifier-0.2.3.dist-info → folder_classifier-0.3.1.dist-info}/top_level.txt +0 -0
folder_classifier/app.py
CHANGED
@@ -3,14 +3,11 @@ import logging
|
|
3
3
|
from fastapi import FastAPI
|
4
4
|
from ray import serve
|
5
5
|
|
6
|
-
from folder_classifier.
|
7
|
-
|
8
|
-
from folder_classifier.util import flatten_folder
|
9
|
-
from folder_classifier import classifier
|
6
|
+
from folder_classifier.classifier import FolderClassifier
|
7
|
+
from folder_classifier.dto import ModelConfig, FolderClassificationRequest, FolderClassificationResponse
|
10
8
|
|
11
9
|
web_api = FastAPI(title=f"Folder Classifier API")
|
12
10
|
|
13
|
-
|
14
11
|
@serve.deployment
|
15
12
|
@serve.ingress(web_api)
|
16
13
|
class FolderClassifierAPI:
|
@@ -19,13 +16,13 @@ class FolderClassifierAPI:
|
|
19
16
|
assert model_config.app_name and model_config.deployment, "Invalid ModelConfig values"
|
20
17
|
logging.basicConfig(level=logging.INFO)
|
21
18
|
self.logger = logging.getLogger(__name__)
|
22
|
-
self.logger.info(f"Initializing model: {model_config}")
|
23
|
-
|
19
|
+
self.logger.info(f"Initializing Folder Classifier model: {model_config}")
|
20
|
+
model_handle = serve.get_deployment_handle(app_name=model_config.app_name, deployment_name=model_config.deployment)
|
21
|
+
self.classifier = FolderClassifier(model_handle)
|
24
22
|
self.logger.info(f"Successfully initialized Folder Classifier API")
|
25
23
|
|
26
24
|
@web_api.post("/predict")
|
27
25
|
async def predict(self, request: FolderClassificationRequest) -> FolderClassificationResponse:
|
28
|
-
listing = request if isinstance(request, Listing) else Listing(items=flatten_folder(request))
|
29
|
-
category, confidence = classifier.predict(listing)
|
30
26
|
self.logger.info(f"Received request: {request}")
|
31
|
-
|
27
|
+
category, reasoning = await self.classifier.predict(request)
|
28
|
+
return FolderClassificationResponse(category=category, reasoning=reasoning)
|
folder_classifier/classifier.py
CHANGED
@@ -1,25 +1,129 @@
|
|
1
|
-
|
2
|
-
import
|
3
|
-
from
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
from typing import Tuple, Dict, Any
|
4
|
+
|
5
|
+
from ray.serve.handle import DeploymentHandle
|
6
|
+
|
7
|
+
from folder_classifier.dto import FolderClassificationRequest, FolderClassification
|
8
|
+
from folder_classifier.util import build_folder, render_tree
|
9
|
+
|
10
|
+
|
11
|
+
MODEL = os.getenv("MODEL", "Qwen3-4B-Instruct-2507-classifier-FP8")
|
12
|
+
|
13
|
+
SYSTEM_PROMPT = """
|
14
|
+
You are a strict text classifier.
|
15
|
+
Output MUST be a single JSON object with exactly two keys: "category" and "reasoning".
|
16
|
+
- "category" ∈ {"matter","other"} (lowercase).
|
17
|
+
- "reasoning" ≤ 30 words.
|
18
|
+
- Do not include any double-quote (") characters inside the value of "reasoning"; use single quotes ' instead.
|
19
|
+
- No backticks, no code fences, no extra text. Return one JSON object only.
|
20
|
+
If unsure, choose "other".
|
21
|
+
""".strip()
|
22
|
+
|
23
|
+
USER_PROMPT_TEMPLATE = """
|
24
|
+
Classify the FOLDER TREE as "matter" or "other".
|
25
|
+
|
26
|
+
Definitions
|
27
|
+
- matter: The FOLDER TREE represents exactly one legal matter/container (one client/case/matter) AND includes at least one legal-work indicator.
|
28
|
+
- other: Anything else, including:
|
29
|
+
(a) the FOLDER TREE appears to contain multiple distinct matters (a container of matters), or
|
30
|
+
(b) the ROOT NAME is a common subfolder/stage/type (e.g., "Correspondence", "Drafts", "Pleadings", "Court Documents", "Billing", "Evidence"), or
|
31
|
+
(c) legal-work indicators are absent, or
|
32
|
+
(d) there are zero files with extensions anywhere in the tree, or
|
33
|
+
(e) contents are exclusively non-legal domains (finance/accounting/IT/admin) with no legal-work indicators, or
|
34
|
+
(f) the ROOT NAME is generic/system-generated and not matter-specific (see R9).
|
35
|
+
|
36
|
+
Decision Rules (apply in order; case-insensitive)
|
37
|
+
RC_container: Any indication the FOLDER TREE holds more than one distinct matter (e.g., multiple top-level matter-like subfolders; repeated separate matter numbers/clients) → category=other.
|
38
|
+
R0_subfolder: ROOT NAME equals a common matter subfolder/stage/type (e.g., Correspondence/Emails, File Notes/Attendance Notes, Searches/Certificates, Court Documents/Pleadings/Evidence/Disclosure/Discovery, Drafts/Final/Signed/Executed, Billing/Invoices/Time Records) → category=other.
|
39
|
+
R7_files_present: Must contain ≥1 file with an extension (e.g., .pdf, .docx) anywhere in the tree; if none → category=other.
|
40
|
+
R8_nonlegal_only: If the tree shows strong non-legal domain signals (finance/accounting/IT/admin) and NO legal-work indicators (R2–R4 or R6), classify as other.
|
41
|
+
• Finance/Accounting examples: BAS/Business Activity Statement, IAS, GST, Tax Return, PAYG, Payroll, Timesheets, Payslips, Superannuation, Xero/MYOB/QuickBooks exports, General Ledger, Trial Balance, Journals, Bank Statements.
|
42
|
+
• IT/Systems examples: Backups, Logs, Source Code, Git, DevOps, Server/Network/VPN, Mailboxes, Google Workspace/Microsoft 365 admin.
|
43
|
+
• Admin-only examples: generic receipts, vendor invoices, expense folders without legal context.
|
44
|
+
• Note: “Billing/Invoices/Time Records” inside a matter is a typical legal subfolder; R8 applies only when legal indicators are entirely absent.
|
45
|
+
R9_generic_rootname: If the ROOT NAME is generic/system-generated and not matter-specific → category=other (even if legal documents appear underneath).
|
46
|
+
• Examples: "CORTO Generated", "Generated Files", "Exports", "Uploads", "Scans", "Shared", "Dropbox Shared", "Google Drive", "OneDrive", "SharePoint", "Archive", "Backup", "Temp", "Incoming", "Outbox", "Bulk Import".
|
47
|
+
• Heuristic: words like generated/export/import/sync/shared/archive/backup/temp/uploads/downloads indicate a system or generic container, not a single matter.
|
48
|
+
R1_rootname: ROOT NAME resembles a single matter/container (matter/file/case number; client/surname/company; or a combination such as “12345 Smith” or “Smith – Contract Dispute”).
|
49
|
+
R2_initial_docs: Early-stage matter docs (cost agreement/disclosure, retainer/engagement, intake/onboarding).
|
50
|
+
R3_legal_docs: Legal document types (agreement, contract, deed, will, affidavit, statement, advice, brief, pleadings, court forms, subpoena, orders, judgment, undertaking, notice of appeal, docket/case forms).
|
51
|
+
R4_legal_subfolders: Typical legal subfolders (correspondence/emails, file notes/attendance notes, searches/certificates, court documents/evidence/disclosure/discovery, drafts/final/signed/executed, billing/invoices/time records).
|
52
|
+
R5_support_filename_patterns: Supportive only (not decisive): versioning (v1/v2/v3), “final”, “executed”, “signed”, eight-digit dates (YYYYMMDD/DDMMYYYY).
|
53
|
+
R6_jurisdiction: Court/jurisdiction/case references (generic court acronyms, registry references, docket patterns).
|
54
|
+
|
55
|
+
Decision
|
56
|
+
- If RC_container → category=other (stop).
|
57
|
+
- Else if R0_subfolder → category=other (stop).
|
58
|
+
- Else if NOT R7_files_present → category=other (stop).
|
59
|
+
- Else if R8_nonlegal_only → category=other (stop).
|
60
|
+
- Else if R9_generic_rootname → category=other (stop).
|
61
|
+
- Else if R1_rootname AND any of {R2_initial_docs, R3_legal_docs, R4_legal_subfolders, R6_jurisdiction} AND no multi-matter signal → category=matter.
|
62
|
+
(R5_support_filename_patterns cannot be used alone to justify "matter"; it is supportive only.)
|
63
|
+
- Else → category=other.
|
64
|
+
|
65
|
+
Normalization
|
66
|
+
- “File with extension”: name containing a period followed by a 1–5 char alphanumeric extension (e.g., .pdf, .docx). Ignore leading/trailing periods.
|
67
|
+
- Treat hyphens/underscores as separators. Ignore file extensions for semantic matching beyond the presence test. Tolerate minor typos.
|
68
|
+
|
69
|
+
Output format (JSON only; no prose before/after):
|
70
|
+
{"category": "<matter|other>", "reasoning": "≤30 words citing R# and 1–2 evidence tokens>"}
|
71
|
+
|
72
|
+
FOLDER TREE:
|
73
|
+
{folder_tree}
|
74
|
+
""".strip()
|
75
|
+
|
76
|
+
|
77
|
+
FOLDER_CLASSIFICATION_SCHEMA = FolderClassification.model_json_schema()
|
78
|
+
|
79
|
+
|
80
|
+
class FolderClassifier:
    """
    Classify a folder listing as "matter" or "other" by prompting a served chat model.

    The request's paths are turned into a rendered folder tree, embedded in the
    user prompt, and sent to the model deployment; the JSON reply is validated
    against FolderClassification.
    """

    def __init__(self, model_handle: DeploymentHandle):
        """
        :param model_handle: handle of the model deployment; its
            ``create_chat_completion`` method is invoked remotely by ``predict``.
        """
        self.logger = logging.getLogger(__name__)
        self.model_handler = model_handle
        self.logger.info("Successfully initialized FolderClassifier")

    async def predict(self, request: FolderClassificationRequest) -> Tuple[str, str]:
        """
        Classify the folder described by ``request``.

        Best effort: any failure while building the request, calling the model
        or validating its reply is logged and replaced by a fallback result
        whose reasoning is "NA". The fallback category is "matter" only when the
        raw reply contains the exact text '"category": "matter"', else "other".

        :param request: classification request carrying the path list in ``items``.
        :return: tuple ``(category, reasoning)``.
        """
        # Bound before the try block: the handler below reads it, and a failure
        # ahead of the assignment inside the try would otherwise surface as an
        # UnboundLocalError instead of the intended fallback.
        content = ""
        try:
            chat_completion_request = self._to_chat_completion_request(request)
            response = await self.model_handler.create_chat_completion.remote(chat_completion_request)
            # A reply without text arrives as None; normalise it so both the
            # validation error path and the substring test below stay well-defined.
            content = response.choices[0].message.content or ""
            result = FolderClassification.model_validate_json(content)
        except Exception as ex:
            # exc_info keeps the traceback so upstream failures are not mistaken
            # for plain parse errors.
            self.logger.error(f"Failed to parse response: {content}\n{ex}", exc_info=True)
            if '"category": "matter"' in content:
                result = FolderClassification(category="matter", reasoning="NA")
            else:
                result = FolderClassification(category="other", reasoning="NA")

        return result.category, result.reasoning

    @staticmethod
    def _to_chat_completion_request(request: FolderClassificationRequest) -> Dict[str, Any]:
        """
        Build the chat-completion payload for ``request``.

        :param request: classification request carrying the path list in ``items``.
        :return: dict with the model name, system/user messages, sampling
            settings and a strict JSON-schema response format.
        """
        input_paths = request.items
        folder = build_folder(input_paths)
        folder_tree = render_tree(folder)
        chat_completion_request = {
            "model": MODEL,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                # str.replace rather than str.format: the template contains
                # literal braces (the JSON example) that format() would reject.
                {"role": "user", "content": USER_PROMPT_TEMPLATE.replace("{folder_tree}", folder_tree)}
            ],
            "max_tokens": 1024,
            "temperature": 0.7,
            "top_p": 0.8,
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "FolderClassification",
                    "schema": FOLDER_CLASSIFICATION_SCHEMA,
                    "strict": True,
                },
            }
        }
        return chat_completion_request
|
125
|
+
|
126
|
+
|
127
|
+
|
24
128
|
|
25
129
|
|
folder_classifier/dto.py
CHANGED
@@ -1,9 +1,5 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from textwrap import dedent
|
4
1
|
from typing import List, Union, Literal, Optional
|
5
|
-
|
6
|
-
from pydantic import BaseModel, Field, confloat
|
2
|
+
from pydantic import BaseModel, Field, ConfigDict
|
7
3
|
|
8
4
|
|
9
5
|
class ModelConfig(BaseModel):
|
@@ -23,32 +19,19 @@ class File(BaseModel):
|
|
23
19
|
class Folder(BaseModel):
|
24
20
|
name: str
|
25
21
|
type: Literal["folder"]
|
26
|
-
|
27
|
-
|
28
|
-
model_config = {
|
29
|
-
"json_schema_extra": {
|
30
|
-
# Override the OpenAPI example to avoid the default 'string' entry
|
31
|
-
"example": dedent("""{
|
32
|
-
"name": "string",
|
33
|
-
"type": "folder",
|
34
|
-
"items": [
|
35
|
-
{
|
36
|
-
"name": "string",
|
37
|
-
"type": "file"
|
38
|
-
}
|
39
|
-
]
|
40
|
-
}""")
|
41
|
-
}
|
42
|
-
}
|
43
|
-
|
44
|
-
|
45
|
-
class Listing(BaseModel):
|
46
|
-
items: List[str]
|
22
|
+
items: List[Union[File, 'Folder']] = Field(default_factory=list)
|
23
|
+
|
47
24
|
|
25
|
+
class FolderClassificationRequest(BaseModel):
|
26
|
+
items: List[str]
|
48
27
|
|
49
|
-
Folder.model_rebuild()
|
50
|
-
FolderClassificationRequest = Listing
|
51
28
|
|
52
29
|
class FolderClassificationResponse(BaseModel):
|
53
30
|
category: Literal["matter", "other"]
|
54
|
-
|
31
|
+
reasoning: Optional[str] = None
|
32
|
+
|
33
|
+
|
34
|
+
class FolderClassification(BaseModel):
    """
    Classification verdict with exactly two fields: a category and its reasoning.
    """

    # extra="forbid": keys other than the two declared fields fail validation
    model_config = ConfigDict(extra="forbid")

    # one of the two supported labels
    category: Literal["matter", "other"]
    # free-text justification for the chosen category
    reasoning: str
|
folder_classifier/util.py
CHANGED
@@ -1,6 +1,78 @@
|
|
1
|
-
from typing import List
|
1
|
+
from typing import List, Union
|
2
2
|
|
3
|
-
from folder_classifier.dto import Folder
|
3
|
+
from folder_classifier.dto import Folder, File
|
4
|
+
|
5
|
+
|
6
|
+
def build_folder(paths: List[str]) -> Folder:
    """
    Assemble a Folder tree from "/"-delimited paths.

    The first segment of the shallowest path names the root; the first segment
    of every other path is skipped, not compared against it. A final segment
    becomes a File unless the same full path also occurs as a directory prefix
    of some path, in which case it becomes a Folder.

    :param paths: "/"-delimited paths; must not be empty.
    :return: the root Folder.
    :raises ValueError: if ``paths`` is empty.
    """
    if not paths:
        raise ValueError("No paths provided")

    # Every proper prefix of a path is known to be a directory
    directory_paths = set()
    for path in paths:
        segments = path.split('/')
        directory_paths.update('/'.join(segments[:end]) for end in range(1, len(segments)))

    # Shallow paths first, so parents exist before their children are visited
    by_depth = sorted(paths, key=lambda candidate: candidate.count('/'))

    root = Folder(name=by_depth[0].split('/')[0], type="folder", items=[])

    for path in by_depth:
        segments = path.split('/')
        leaf_position = len(segments) - 1
        cursor = root
        for position in range(1, len(segments)):
            name = segments[position]

            # Reuse an entry already present under the current folder
            match = None
            for candidate in cursor.items:
                if candidate.name == name:
                    match = candidate
                    break
            if match:
                if isinstance(match, Folder):
                    cursor = match
                continue

            # At the leaf the joined prefix is the whole path itself
            if position == leaf_position and path not in directory_paths:
                cursor.items.append(File(name=name, type="file"))
            else:
                child = Folder(name=name, type="folder", items=[])
                cursor.items.append(child)
                cursor = child

    return root
|
54
|
+
|
55
|
+
|
56
|
+
def render_tree(folder: Folder) -> str:
    """
    Render a Folder tree as text using box-drawing connectors (├──, └──, │).

    The root name is the first line; each descendant follows on its own line,
    prefixed by one four-column segment per ancestor level and a connector.

    :param folder: root of the tree to render.
    :return: the rendered lines joined with newlines.
    """
    lines: List[str] = [folder.name]

    def recurse(node: Union[Folder, File], prefix: str, is_last: bool) -> None:
        connector = "└── " if is_last else "├── "
        lines.append(f"{prefix}{connector}{node.name}")
        if isinstance(node, Folder):
            # Continuation segments are as wide as the connectors (4 columns),
            # so every nesting level occupies the same width whether or not the
            # ancestor was the last child.
            child_prefix = prefix + ("    " if is_last else "│   ")
            last_index = len(node.items) - 1
            for idx, child in enumerate(node.items):
                recurse(child, child_prefix, idx == last_index)

    # Children of the root start at column zero, directly under the root name
    last_index = len(folder.items) - 1
    for idx, child in enumerate(folder.items):
        recurse(child, "", idx == last_index)

    return "\n".join(lines)
|
4
76
|
|
5
77
|
|
6
78
|
def flatten_folder(folder: Folder, parent_path: str = "") -> List[str]:
|
@@ -18,4 +90,6 @@ def flatten_folder(folder: Folder, parent_path: str = "") -> List[str]:
|
|
18
90
|
else:
|
19
91
|
# Recursively flatten subfolders
|
20
92
|
paths.extend(flatten_folder(item, current_path))
|
21
|
-
return paths
|
93
|
+
return paths
|
94
|
+
|
95
|
+
|
@@ -0,0 +1,10 @@
|
|
1
|
+
folder_classifier/__init__.py,sha256=k0YWZyUNe7myJiKeX0OaXtJ30_3EGE-vsZiAUbqa-3E,46
|
2
|
+
folder_classifier/app.py,sha256=doaXkKnpe-LNztkMkuROutjId7XPS_t9G7m9dSXt-ow,1330
|
3
|
+
folder_classifier/classifier.py,sha256=3V4M-x5K7vASStsnPlrW2M9tA6uWsKA4L1rCMHqCQ_M,7705
|
4
|
+
folder_classifier/deploy.py,sha256=UQTbQjR_JX92Xo8L5EbK4nPg_VJmwWDmzMQSjP3-7iQ,412
|
5
|
+
folder_classifier/dto.py,sha256=pCRErTIflx8zMe7dD_GcnaBagPYpuZy2vIZCAbb0i0Q,767
|
6
|
+
folder_classifier/util.py,sha256=4zk52bPVyoVU0FfDTmeK5JdM4y0IbnuaPS1PWc9txa4,3065
|
7
|
+
folder_classifier-0.3.1.dist-info/METADATA,sha256=x4Dv4CCUXoo0GAL5DX3I3FPrHvTWYKowY1UrkMGD2Gs,392
|
8
|
+
folder_classifier-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
9
|
+
folder_classifier-0.3.1.dist-info/top_level.txt,sha256=36ugc9pEbNQ-mnzz4Ot2WVjY3t_LzAN6XOCjDFP4p4k,18
|
10
|
+
folder_classifier-0.3.1.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
folder_classifier/__init__.py,sha256=k0YWZyUNe7myJiKeX0OaXtJ30_3EGE-vsZiAUbqa-3E,46
|
2
|
-
folder_classifier/app.py,sha256=FqoBp_KQ3yIfoHTagdDTVsLbIj8luCn7en533Q870x8,1443
|
3
|
-
folder_classifier/classifier.py,sha256=YVKXvNAHny167H0Iv0GlRJEtRMAYwGea3cUVeGC4_sI,668
|
4
|
-
folder_classifier/deploy.py,sha256=UQTbQjR_JX92Xo8L5EbK4nPg_VJmwWDmzMQSjP3-7iQ,412
|
5
|
-
folder_classifier/dto.py,sha256=Xb1ozZQkfC45cbBoNOZ8xfkillJWFKXKWQc9CZxppXI,1260
|
6
|
-
folder_classifier/util.py,sha256=t-ma2suHovfNutogJb9jailRbRpIg4qv-zph3dHb2og,692
|
7
|
-
folder_classifier-0.2.3.dist-info/METADATA,sha256=MZ_BgIBlPrq86na7t9aMXA8EONA05MaHPofJV1P6hZM,392
|
8
|
-
folder_classifier-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
9
|
-
folder_classifier-0.2.3.dist-info/top_level.txt,sha256=36ugc9pEbNQ-mnzz4Ot2WVjY3t_LzAN6XOCjDFP4p4k,18
|
10
|
-
folder_classifier-0.2.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|