folder-classifier 0.3.5__tar.gz → 0.3.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/PKG-INFO +1 -1
- folder_classifier-0.3.7/folder_classifier/classifier.py +131 -0
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/folder_classifier.egg-info/PKG-INFO +1 -1
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/setup.cfg +1 -1
- folder_classifier-0.3.5/folder_classifier/classifier.py +0 -135
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/README.md +0 -0
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/folder_classifier/__init__.py +0 -0
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/folder_classifier/app.py +0 -0
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/folder_classifier/deploy.py +0 -0
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/folder_classifier/dto.py +0 -0
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/folder_classifier/util.py +0 -0
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/folder_classifier.egg-info/SOURCES.txt +0 -0
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/folder_classifier.egg-info/dependency_links.txt +0 -0
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/folder_classifier.egg-info/top_level.txt +0 -0
- {folder_classifier-0.3.5 → folder_classifier-0.3.7}/pyproject.toml +0 -0
@@ -0,0 +1,131 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
from typing import Tuple, Dict, Any
|
4
|
+
|
5
|
+
from ray import serve
|
6
|
+
|
7
|
+
from folder_classifier.dto import FolderClassificationRequest, FolderClassification
|
8
|
+
from folder_classifier.util import build_folder, render_tree
|
9
|
+
|
10
|
+
SYSTEM_PROMPT = r"""
|
11
|
+
You are a strict text classifier. Provide a single JSON object with exactly two keys: "category" and "reasoning".
|
12
|
+
- "category": either "matter" or "other" (lowercase).
|
13
|
+
- "reasoning": 1–2 short sentences referencing the key rule(s) that decided it.
|
14
|
+
No markdown (no backticks or code blocks) or any extra text outside the JSON. No chain-of-thought explanations or extra keys. If uncertain, choose "other".
|
15
|
+
""".strip()
|
16
|
+
|
17
|
+
USER_PROMPT_TEMPLATE = r"""
|
18
|
+
Task: Classify the folder tree as 'matter' or 'other'.
|
19
|
+
|
20
|
+
Decision rules (apply in order):
|
21
|
+
1) If there are no files with an extension anywhere in the tree, classify as 'other'.
|
22
|
+
2) If the root folder appears to be a container of multiple matters, classify as 'other'.
|
23
|
+
3) If the root folder is or ends with a common subfolder name or descriptor found inside legal matters (e.g., "Email", "Summons", "Emails", "Documents", "Correspondence", "Drafts", "Pleadings", "Court Documents", "Billing", or similar descriptive folder types), classify as 'other' even if it contains legal documents.
|
24
|
+
|
25
|
+
4) If the root folder name matches any Matter Folder Naming Pattern and there is at least one file with an extension anywhere in the tree (including subfolders), and there is at least one file, subfolder, or filename that directly and unambiguously references a legal, client-matter, or professional context—for example, a legal document type, an initial or core legal document, clear legal terminology, a jurisdiction/court reference, the name of a law firm or legal/financial professional, or an activity specific to legal work—classify as 'matter'.
|
26
|
+
Do not classify as 'matter' if the folder only contains general business documents (e.g., invoices, estimates, generic correspondence) and there are no strong indicators of legal, client, or matter-related content as defined above.
|
27
|
+
5) If the root folder name very compellingly looks like a matter folder (e.g., "11206 - AcmeX Pty v Acme Corp"), classify as 'matter' even if the documents are not initial/core/legal documents.
|
28
|
+
6) If none of the above apply, classify as 'other'.
|
29
|
+
|
30
|
+
Matter Folder Naming Patterns (case-insensitive; separators like space, hyphen, underscore are fine):
|
31
|
+
• Matter number alone or combined with a client/surname/company (e.g., "12345", "12345 Smith", "Smith - Contract Dispute").
|
32
|
+
• Client name/surname/company/business name, optionally with matter type or client reference (e.g., "Brown – Lease Review", "Jones Family – Estate").
|
33
|
+
• Common file-number prefixes/suffixes (e.g., "MAT-1234", "CLT001", "2025-0001", "ACME_2024_Lease").
|
34
|
+
• Suggested regex-style hints (not strict requirements):
|
35
|
+
- Numeric ID: ^\d{4,}$
|
36
|
+
- Prefixed ID: ^(MAT|CLT|FILE|CASE|REF)[-_]?\d{3,}
|
37
|
+
- Name + type: ^[A-Za-z].*(Lease|Contract|Estate|Dispute|Sale|Acquisition|Conveyance|Family|Probate|Litigation).*
|
38
|
+
|
39
|
+
Initial or Core Documents (early-stage client docs):
|
40
|
+
• File Cover Sheet
|
41
|
+
• Cost Agreement / Costs Disclosure
|
42
|
+
• Retainer Instructions / Engagement Letter
|
43
|
+
• Onboarding Questionnaire / Client Intake Form
|
44
|
+
|
45
|
+
Legal Document Types (non-exhaustive):
|
46
|
+
• Contract, Deed, Agreement, Will
|
47
|
+
• Affidavit, Statement
|
48
|
+
• Brief to Counsel, Advice
|
49
|
+
• Court Forms, Pleadings, Subpoena, Orders, Judgment, Undertaking
|
50
|
+
|
51
|
+
Subfolder Indicators (often—but not always—present in matters):
|
52
|
+
• Correspondence / Emails
|
53
|
+
• File Notes / Attendance Notes
|
54
|
+
• Searches / Certificates
|
55
|
+
• Court Documents / Evidence / Disclosure / Discovery
|
56
|
+
• Drafts / Final / Signed / Executed
|
57
|
+
• Billing / Invoices / Time Records
|
58
|
+
|
59
|
+
File Naming Patterns (helpful signals):
|
60
|
+
• Dates in YYYYMMDD or DDMMYYYY.
|
61
|
+
• Legal terminology (e.g., "Letter to other side", "Draft_Affidavit_v3").
|
62
|
+
• Versioning (v1, v2, Final, Executed, Signed).
|
63
|
+
• Jurisdiction or court references (e.g., NSWSC, VCAT, Family Court, FCFCOA).
|
64
|
+
|
65
|
+
Definitions/assumptions:
|
66
|
+
• "Document" = a file (e.g., .pdf, .docx, .rtf, .txt, .xlsx, .msg/.eml). Folders are not documents.
|
67
|
+
• Treat "templates" or "precedents" as weak signals unless clearly client/matter-specific.
|
68
|
+
• Evaluate only the content shown in the tree—do not infer from outside knowledge.
|
69
|
+
|
70
|
+
Output format (JSON only, no extra text):
|
71
|
+
{"category": "<matter|other>", "reasoning": "<1–2 short sentences referencing the key rule(s) that decided it>"}
|
72
|
+
|
73
|
+
FOLDER TREE:
|
74
|
+
{folder_tree}
|
75
|
+
""".strip()
|
76
|
+
|
77
|
+
|
78
|
+
# JSON schema generated from the FolderClassification model; it is sent as the
# "schema" entry of the structured-output "response_format" in each request.
FOLDER_CLASSIFICATION_SCHEMA = FolderClassification.model_json_schema()
|
79
|
+
|
80
|
+
|
81
|
+
class FolderClassifier:
    """Classify a folder tree as "matter" or "other" via a remote Ray Serve model.

    The request's paths are rendered as a text tree, sent to the chat-completion
    deployment together with the module-level prompts, and the JSON answer is
    parsed into a ``FolderClassification``.
    """

    def __init__(self, app_name: str, deployment: str, model: str):
        """Look up the Ray Serve deployment handle used for inference.

        Args:
            app_name: Ray Serve application name passed to ``get_deployment_handle``.
            deployment: Deployment name inside that application.
            model: Model identifier sent in the "model" field of every request.
        """
        self.logger = logging.getLogger(__name__)
        self.model_handle = serve.get_deployment_handle(app_name=app_name, deployment_name=deployment)
        self.model = model
        self.logger.info(f"Successfully initialized Folder Classifier with remote Ray model: {self.model}")

    async def predict(self, request: "FolderClassificationRequest") -> Tuple[str, str]:
        """Classify the folder described by ``request``.

        Best-effort: any exception raised while building the request, calling
        the model, or parsing its answer is logged as a warning and replaced by
        a fallback classification with reasoning "NA".

        Returns:
            A ``(category, reasoning)`` tuple.
        """
        content = ""
        try:
            chat_completion_request = self._to_chat_completion_request(request)
            response = await self.model_handle.create_chat_completion_internal.remote(chat_completion_request)
            response_dict = json.loads(response.body)
            content = response_dict["choices"][0]["message"]["content"]
            result = FolderClassification.model_validate_json(content)
        except Exception as ex:
            self.logger.warning(f"Failed to parse response: {content}\n{ex}")
            # Salvage the category from the raw text when strict validation fails.
            result = FolderClassification(category=self._fallback_category(content), reasoning="NA")
        return result.category, result.reasoning

    @staticmethod
    def _fallback_category(content: Any) -> str:
        """Return "matter" if the raw text contains a "category": "matter" pair, else "other".

        Whitespace is ignored so compact (``{"category":"matter"}``) and
        pretty-printed JSON are both recognised. Non-string content (for example
        a ``None`` message body) yields "other" instead of raising inside the
        caller's exception handler.
        """
        if not isinstance(content, str):
            return "other"
        compact = "".join(content.split())
        return "matter" if '"category":"matter"' in compact else "other"

    def _to_chat_completion_request(self, request: "FolderClassificationRequest") -> Dict[str, Any]:
        """Build the chat-completion payload for ``request``.

        The paths in ``request.items`` are turned into a folder structure,
        rendered as a text tree, and substituted for the ``{folder_tree}``
        placeholder of the user prompt.
        """
        input_paths = request.items
        folder = build_folder(input_paths)
        folder_tree = render_tree(folder)
        chat_completion_request = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                # str.replace rather than str.format: the template contains literal braces.
                {"role": "user", "content": USER_PROMPT_TEMPLATE.replace("{folder_tree}", folder_tree)}
            ],
            "max_tokens": 1024,
            # NOTE(review): sampling with temperature > 0 can make classifications
            # vary between identical requests - confirm this is intended.
            "temperature": 0.7,
            "top_p": 0.8,
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "FolderClassification",
                    "schema": FOLDER_CLASSIFICATION_SCHEMA,
                    "strict": True,
                },
            }
        }
        return chat_completion_request
|
127
|
+
|
128
|
+
|
129
|
+
|
130
|
+
|
131
|
+
|
@@ -1,135 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
import logging
|
3
|
-
from typing import Tuple, Dict, Any
|
4
|
-
|
5
|
-
from ray import serve
|
6
|
-
|
7
|
-
from folder_classifier.dto import FolderClassificationRequest, FolderClassification
|
8
|
-
from folder_classifier.util import build_folder, render_tree
|
9
|
-
|
10
|
-
SYSTEM_PROMPT = """
|
11
|
-
You are a strict text classifier.
|
12
|
-
Output MUST be a single JSON object with exactly two keys: "category" and "reasoning".
|
13
|
-
- "category" ∈ {"matter","other"} (lowercase).
|
14
|
-
- "reasoning" ≤ 30 words, citing rule IDs (e.g., RC+R7 or R1+R3) and up to two brief evidence tokens.
|
15
|
-
- Do not include any double-quote (") characters inside the value of "reasoning"; use single quotes ' instead.
|
16
|
-
- No backticks, no code fences, no extra text. Return one JSON object only.
|
17
|
-
No step-by-step thinking. No extra keys. If unsure, choose "other".
|
18
|
-
""".strip()
|
19
|
-
|
20
|
-
USER_PROMPT_TEMPLATE = """
|
21
|
-
Classify the FOLDER TREE as "matter" or "other".
|
22
|
-
|
23
|
-
Definitions
|
24
|
-
- matter: The FOLDER TREE represents exactly one legal matter/container (one client/case/matter) AND includes at least one legal-work indicator.
|
25
|
-
- other: Anything else, including:
|
26
|
-
(a) the FOLDER TREE appears to contain multiple distinct matters (a container of matters), or
|
27
|
-
(b) the ROOT NAME is a common subfolder/stage/type (e.g., "Correspondence", "Drafts", "Pleadings", "Court Documents", "Billing", "Evidence"), or
|
28
|
-
(c) legal-work indicators are absent, or
|
29
|
-
(d) there are zero files with extensions anywhere in the tree, or
|
30
|
-
(e) contents are exclusively non-legal domains (finance/accounting/IT/admin) with no legal-work indicators, or
|
31
|
-
(f) the ROOT NAME is non-matter-specific, including either:
|
32
|
-
(f1) system/provenance roots (e.g., “CORTO Generated”, “Exports”, “Uploads”, “Dropbox Shared”, “OneDrive”, “SharePoint”,
|
33
|
-
“Archive”, “Backup”, “Temp”, “Incoming”, “Outbox”, “Bulk Import”), or
|
34
|
-
(f2) informal/placeholder roots lacking client/company/matter cues (e.g., “hello”, “test”, “misc”, “docs”, “files”,
|
35
|
-
“images”, “photos”, “scans”, “new folder”, “untitled”, “random”; very short single tokens with no digits).
|
36
|
-
|
37
|
-
Decision Rules (apply in order; case-insensitive)
|
38
|
-
RC_container: Any indication the FOLDER TREE holds more than one distinct matter → category=other.
|
39
|
-
• Signals include: ≥2 top-level subfolders whose names match matter-like patterns (e.g., “<number> - <id>”, “<client> - <topic>”), even if those subfolders are empty.
|
40
|
-
• Repetitive top-level “matter-number” patterns also indicate a container.
|
41
|
-
R0_subfolder: ROOT NAME equals a common matter subfolder/stage/type (e.g., Correspondence/Emails, File Notes/Attendance Notes, Searches/Certificates, Court Documents/Pleadings/Evidence/Disclosure/Discovery, Drafts/Final/Signed/Executed, Billing/Invoices/Time Records) → category=other.
|
42
|
-
R7_files_present: Must contain ≥1 file with an extension (e.g., .pdf, .docx) anywhere in the tree; if none → category=other.
|
43
|
-
R8_nonlegal_only: If the tree shows strong non-legal domain signals and NO legal-work indicators (R2–R4 or R6), classify as other.
|
44
|
-
• Finance/Accounting examples: BAS/IAS/GST/Tax Return, PAYG, payroll/timesheets/payslips, superannuation, Xero/MYOB/QuickBooks exports, GL/Trial Balance/Journals, bank statements, invoices/receipts, reconciliations, aged AR/AP, debtor/creditor lists, debtor collection schedules, AR “demand/reminder” letters in collections context.
|
45
|
-
• IT/Systems examples: backups, logs, source code, Git/DevOps, server/network/VPN, mailboxes, workspace/365 admin.
|
46
|
-
• Admin-only examples: generic receipts/vendor invoices/expenses without legal context.
|
47
|
-
R9_non_matter_specific_rootname: ROOT NAME is non-matter-specific → category=other.
|
48
|
-
• R9a_system_provenance: matches system/provenance roots (generated/export/import/shared/archive/backup/temp/uploads/
|
49
|
-
downloads/platform names like Dropbox/OneDrive/SharePoint).
|
50
|
-
• R9b_placeholder_informal: matches informal/placeholder roots with no client/company/matter cues (single common words,
|
51
|
-
“new folder”, “untitled”, short tokens without digits).
|
52
|
-
R1_rootname: ROOT NAME resembles a single matter/container (matter/case/file number; client/surname/company; or a combination like “12345 Smith”, “Smith – Contract Dispute”, “Brown – Lease Review”).
|
53
|
-
R2_initial_docs: Early-stage matter docs (cost agreement/disclosure, retainer/engagement, intake/onboarding).
|
54
|
-
R3_legal_docs: Legal document types (agreement, contract, deed, will, affidavit, statement, advice, brief, pleadings, court forms, subpoena, orders, judgment, undertaking, notice of appeal, docket/case forms).
|
55
|
-
R4_legal_subfolders: Typical legal subfolders (correspondence/emails, file notes/attendance notes, searches/certificates, court documents/evidence/disclosure/discovery, drafts/final/signed/executed, billing/invoices/time records).
|
56
|
-
R5_support_filename_patterns: Supportive only (not decisive): versioning (v1/v2/v3), “final”, “executed”, “signed”, eight-digit dates (YYYYMMDD/DDMMYYYY).
|
57
|
-
R6_jurisdiction: Court/jurisdiction/case references (generic court acronyms, registry references, docket patterns).
|
58
|
-
|
59
|
-
Decision
|
60
|
-
- If RC_container → category=other (stop).
|
61
|
-
- Else if R0_subfolder → category=other (stop).
|
62
|
-
- Else if NOT R7_files_present → category=other (stop).
|
63
|
-
- Else if R8_nonlegal_only → category=other (stop).
|
64
|
-
- Else if R9a_system_provenance → category=other (stop).
|
65
|
-
- Else if R9b_placeholder_informal → category=other (stop).
|
66
|
-
- Else if R1_rootname AND any of {R2, R3, R4, R6} AND no multi-matter signal → category=matter.
|
67
|
-
(R5 is supportive only and cannot justify "matter" by itself.)
|
68
|
-
- Else → category=other.
|
69
|
-
|
70
|
-
Normalization
|
71
|
-
- “File with extension”: period + 1–5 char alphanumeric extension (e.g., .pdf, .docx). Ignore leading/trailing periods.
|
72
|
-
- Treat hyphens/underscores as separators. Ignore extensions for semantic matching beyond presence. Tolerate minor typos.
|
73
|
-
|
74
|
-
Output (JSON only; no prose before/after):
|
75
|
-
{"category": "<matter|other>", "reasoning": "<≤30 words citing R# and 1–2 evidence tokens>"}
|
76
|
-
|
77
|
-
FOLDER TREE:
|
78
|
-
{folder_tree}
|
79
|
-
""".strip()
|
80
|
-
|
81
|
-
|
82
|
-
# JSON schema generated from the FolderClassification model; it is sent as the
# "schema" entry of the structured-output "response_format" in each request.
FOLDER_CLASSIFICATION_SCHEMA = FolderClassification.model_json_schema()
|
83
|
-
|
84
|
-
|
85
|
-
class FolderClassifier:
    """Classify a folder tree as "matter" or "other" via a remote Ray Serve model.

    The request's paths are rendered as a text tree, sent to the chat-completion
    deployment together with the module-level prompts, and the JSON answer is
    parsed into a ``FolderClassification``.
    """

    def __init__(self, app_name: str, deployment: str, model: str):
        """Look up the Ray Serve deployment handle used for inference.

        Args:
            app_name: Ray Serve application name passed to ``get_deployment_handle``.
            deployment: Deployment name inside that application.
            model: Model identifier sent in the "model" field of every request.
        """
        self.logger = logging.getLogger(__name__)
        self.model_handle = serve.get_deployment_handle(app_name=app_name, deployment_name=deployment)
        self.model = model
        self.logger.info(f"Successfully initialized Folder Classifier with remote Ray model: {self.model}")

    async def predict(self, request: "FolderClassificationRequest") -> Tuple[str, str]:
        """Classify the folder described by ``request``.

        Best-effort: any exception raised while building the request, calling
        the model, or parsing its answer is logged as a warning and replaced by
        a fallback classification with reasoning "NA".

        Returns:
            A ``(category, reasoning)`` tuple.
        """
        content = ""
        try:
            chat_completion_request = self._to_chat_completion_request(request)
            response = await self.model_handle.create_chat_completion_internal.remote(chat_completion_request)
            response_dict = json.loads(response.body)
            content = response_dict["choices"][0]["message"]["content"]
            result = FolderClassification.model_validate_json(content)
        except Exception as ex:
            self.logger.warning(f"Failed to parse response: {content}\n{ex}")
            # Salvage the category from the raw text when strict validation fails.
            result = FolderClassification(category=self._fallback_category(content), reasoning="NA")
        return result.category, result.reasoning

    @staticmethod
    def _fallback_category(content: Any) -> str:
        """Return "matter" if the raw text contains a "category": "matter" pair, else "other".

        Whitespace is ignored so compact (``{"category":"matter"}``) and
        pretty-printed JSON are both recognised. Non-string content (for example
        a ``None`` message body) yields "other" instead of raising inside the
        caller's exception handler.
        """
        if not isinstance(content, str):
            return "other"
        compact = "".join(content.split())
        return "matter" if '"category":"matter"' in compact else "other"

    def _to_chat_completion_request(self, request: "FolderClassificationRequest") -> Dict[str, Any]:
        """Build the chat-completion payload for ``request``.

        The paths in ``request.items`` are turned into a folder structure,
        rendered as a text tree, and substituted for the ``{folder_tree}``
        placeholder of the user prompt.
        """
        input_paths = request.items
        folder = build_folder(input_paths)
        folder_tree = render_tree(folder)
        chat_completion_request = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                # str.replace rather than str.format: the template contains literal braces.
                {"role": "user", "content": USER_PROMPT_TEMPLATE.replace("{folder_tree}", folder_tree)}
            ],
            "max_tokens": 1024,
            # NOTE(review): sampling with temperature > 0 can make classifications
            # vary between identical requests - confirm this is intended.
            "temperature": 0.7,
            "top_p": 0.8,
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "FolderClassification",
                    "schema": FOLDER_CLASSIFICATION_SCHEMA,
                    "strict": True,
                },
            }
        }
        return chat_completion_request
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{folder_classifier-0.3.5 → folder_classifier-0.3.7}/folder_classifier.egg-info/dependency_links.txt
RENAMED
File without changes
|
{folder_classifier-0.3.5 → folder_classifier-0.3.7}/folder_classifier.egg-info/top_level.txt
RENAMED
File without changes
|
File without changes
|