folder-classifier 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
folder_classifier/app.py CHANGED
@@ -17,7 +17,7 @@ class FolderClassifierAPI:
17
17
  logging.basicConfig(level=logging.INFO)
18
18
  self.logger = logging.getLogger(__name__)
19
19
  self.classifier = FolderClassifier(app_name=model_config.app_name, deployment=model_config.deployment, model=model_config.model)
20
- self.logger.info(f"Successfully initialized Folder Classifier using config: {model_config}")
20
+ self.logger.info(f"Successfully initialized Folder Classifier API using config: {model_config}")
21
21
 
22
22
  @web_api.post("/predict")
23
23
  async def predict(self, request: FolderClassificationRequest) -> FolderClassificationResponse:
@@ -1,28 +1,20 @@
1
1
  import json
2
2
  import logging
3
- import os
4
3
  from typing import Tuple, Dict, Any
5
4
 
6
5
  from ray import serve
7
- from ray.serve.handle import DeploymentHandle
8
6
 
9
7
  from folder_classifier.dto import FolderClassificationRequest, FolderClassification
10
8
  from folder_classifier.util import build_folder, render_tree
11
- from openai import AsyncOpenAI
12
-
13
-
14
- OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://internal-multimodal-services.dev.cortoaws.com/serve/qwen3-4b-classifier/v1")
15
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "DUMMY_KEY")
16
- MODEL = os.getenv("MODEL", "Qwen3-4B-Instruct-2507-classifier-FP8")
17
9
 
18
10
  SYSTEM_PROMPT = """
19
11
  You are a strict text classifier.
20
12
  Output MUST be a single JSON object with exactly two keys: "category" and "reasoning".
21
13
  - "category" ∈ {"matter","other"} (lowercase).
22
- - "reasoning" ≤ 30 words.
14
+ - "reasoning" ≤ 30 words, citing rule IDs (e.g., RC+R7 or R1+R3) and up to two brief evidence tokens.
23
15
  - Do not include any double-quote (") characters inside the value of "reasoning"; use single quotes ' instead.
24
16
  - No backticks, no code fences, no extra text. Return one JSON object only.
25
- If unsure, choose "other".
17
+ No step-by-step thinking. No extra keys. If unsure, choose "other".
26
18
  """.strip()
27
19
 
28
20
  USER_PROMPT_TEMPLATE = """
@@ -36,21 +28,28 @@ Definitions
36
28
  (c) legal-work indicators are absent, or
37
29
  (d) there are zero files with extensions anywhere in the tree, or
38
30
  (e) contents are exclusively non-legal domains (finance/accounting/IT/admin) with no legal-work indicators, or
39
- (f) the ROOT NAME is generic/system-generated and not matter-specific (see R9).
31
+ (f) the ROOT NAME is non-matter-specific, including either:
32
+ (f1) system/provenance roots (e.g., “CORTO Generated”, “Exports”, “Uploads”, “Dropbox Shared”, “OneDrive”, “SharePoint”,
33
+ “Archive”, “Backup”, “Temp”, “Incoming”, “Outbox”, “Bulk Import”), or
34
+ (f2) informal/placeholder roots lacking client/company/matter cues (e.g., “hello”, “test”, “misc”, “docs”, “files”,
35
+ “images”, “photos”, “scans”, “new folder”, “untitled”, “random”; very short single tokens with no digits).
40
36
 
41
37
  Decision Rules (apply in order; case-insensitive)
42
- RC_container: Any indication the FOLDER TREE holds more than one distinct matter (e.g., multiple top-level matter-like subfolders; repeated separate matter numbers/clients) → category=other.
38
+ RC_container: Any indication the FOLDER TREE holds more than one distinct matter → category=other.
39
+ • Signals include: ≥2 top-level subfolders whose names match matter-like patterns (e.g., “<number> - <id>”, “<client> - <topic>”), even if those subfolders are empty.
40
+ • Repetitive top-level “matter-number” patterns also indicate a container.
43
41
  R0_subfolder: ROOT NAME equals a common matter subfolder/stage/type (e.g., Correspondence/Emails, File Notes/Attendance Notes, Searches/Certificates, Court Documents/Pleadings/Evidence/Disclosure/Discovery, Drafts/Final/Signed/Executed, Billing/Invoices/Time Records) → category=other.
44
42
  R7_files_present: Must contain ≥1 file with an extension (e.g., .pdf, .docx) anywhere in the tree; if none → category=other.
45
- R8_nonlegal_only: If the tree shows strong non-legal domain signals (finance/accounting/IT/admin) and NO legal-work indicators (R2–R4 or R6), classify as other.
46
- • Finance/Accounting examples: BAS/Business Activity Statement, IAS, GST, Tax Return, PAYG, Payroll, Timesheets, Payslips, Superannuation, Xero/MYOB/QuickBooks exports, General Ledger, Trial Balance, Journals, Bank Statements.
47
- • IT/Systems examples: Backups, Logs, Source Code, Git, DevOps, Server/Network/VPN, Mailboxes, Google Workspace/Microsoft 365 admin.
48
- • Admin-only examples: generic receipts, vendor invoices, expense folders without legal context.
49
- • Note: “Billing/Invoices/Time Records” inside a matter is a typical legal subfolder; R8 applies only when legal indicators are entirely absent.
50
- R9_generic_rootname: If the ROOT NAME is generic/system-generated and not matter-specific → category=other (even if legal documents appear underneath).
51
- Examples: "CORTO Generated", "Generated Files", "Exports", "Uploads", "Scans", "Shared", "Dropbox Shared", "Google Drive", "OneDrive", "SharePoint", "Archive", "Backup", "Temp", "Incoming", "Outbox", "Bulk Import".
52
- Heuristic: words like generated/export/import/sync/shared/archive/backup/temp/uploads/downloads indicate a system or generic container, not a single matter.
53
- R1_rootname: ROOT NAME resembles a single matter/container (matter/file/case number; client/surname/company; or a combination such as 12345 Smith” or “Smith – Contract Dispute”).
43
+ R8_nonlegal_only: If the tree shows strong non-legal domain signals and NO legal-work indicators (R2–R4 or R6), classify as other.
44
+ • Finance/Accounting examples: BAS/IAS/GST/Tax Return, PAYG, payroll/timesheets/payslips, superannuation, Xero/MYOB/QuickBooks exports, GL/Trial Balance/Journals, bank statements, invoices/receipts, reconciliations, aged AR/AP, debtor/creditor lists, debtor collection schedules, AR “demand/reminder” letters in collections context.
45
+ • IT/Systems examples: backups, logs, source code, Git/DevOps, server/network/VPN, mailboxes, workspace/365 admin.
46
+ • Admin-only examples: generic receipts/vendor invoices/expenses without legal context.
47
+ R9_non_matter_specific_rootname: ROOT NAME is non-matter-specific → category=other.
48
+ • R9a_system_provenance: matches system/provenance roots (generated/export/import/shared/archive/backup/temp/uploads/
49
+ downloads/platform names like Dropbox/OneDrive/SharePoint).
50
+ • R9b_placeholder_informal: matches informal/placeholder roots with no client/company/matter cues (single common words,
51
+ “new folder”, “untitled”, short tokens without digits).
52
+ R1_rootname: ROOT NAME resembles a single matter/container (matter/case/file number; client/surname/company; or a combination like “12345 Smith”, “Smith – Contract Dispute”, “Brown – Lease Review”).
54
53
  R2_initial_docs: Early-stage matter docs (cost agreement/disclosure, retainer/engagement, intake/onboarding).
55
54
  R3_legal_docs: Legal document types (agreement, contract, deed, will, affidavit, statement, advice, brief, pleadings, court forms, subpoena, orders, judgment, undertaking, notice of appeal, docket/case forms).
56
55
  R4_legal_subfolders: Typical legal subfolders (correspondence/emails, file notes/attendance notes, searches/certificates, court documents/evidence/disclosure/discovery, drafts/final/signed/executed, billing/invoices/time records).
@@ -63,16 +62,17 @@ Decision
63
62
  - Else if NOT R7_files_present → category=other (stop).
64
63
  - Else if R8_nonlegal_only → category=other (stop).
65
64
  - Else if R9_non_matter_specific_rootname → category=other (stop).
66
- - Else if R1_rootname AND any of {R2_initial_docs, R3_legal_docs, R4_legal_subfolders, R6_jurisdiction} AND no multi-matter signal → category=matter.
67
- (R5_support_filename_patterns cannot be used alone to justify "matter"; it is supportive only.)
65
+ - Else if R10_weak_rootname → category=other (stop).
66
+ - Else if R1_rootname AND any of {R2, R3, R4, R6} AND no multi-matter signal → category=matter.
67
+ (R5 is supportive only and cannot justify "matter" by itself.)
68
68
  - Else → category=other.
69
69
 
70
70
  Normalization
71
- - “File with extension”: name containing a period followed by a 1–5 char alphanumeric extension (e.g., .pdf, .docx). Ignore leading/trailing periods.
72
- - Treat hyphens/underscores as separators. Ignore file extensions for semantic matching beyond the presence test. Tolerate minor typos.
71
+ - “File with extension”: period + 1–5 char alphanumeric extension (e.g., .pdf, .docx). Ignore leading/trailing periods.
72
+ - Treat hyphens/underscores as separators. Ignore extensions for semantic matching beyond presence. Tolerate minor typos.
73
73
 
74
- Output format (JSON only; no prose before/after):
75
- {"category": "<matter|other>", "reasoning": "30 words citing R# and 1–2 evidence tokens>"}
74
+ Output (JSON only; no prose before/after):
75
+ {"category": "<matter|other>", "reasoning": "<≤30 words citing R# and 1–2 evidence tokens>"}
76
76
 
77
77
  FOLDER TREE:
78
78
  {folder_tree}
@@ -87,7 +87,7 @@ class FolderClassifier:
87
87
  self.logger = logging.getLogger(__name__)
88
88
  self.model_handle = serve.get_deployment_handle(app_name=app_name, deployment_name=deployment)
89
89
  self.model = model
90
- self.logger.info(f"Successfully initialized FolderClassifier")
90
+ self.logger.info(f"Successfully initialized Folder Classifier with remote Ray model: {self.model}")
91
91
 
92
92
  async def predict(self, request: FolderClassificationRequest) -> Tuple[str, str]:
93
93
  content = ""
@@ -98,7 +98,7 @@ class FolderClassifier:
98
98
  content = response_dict["choices"][0]["message"]["content"]
99
99
  result = FolderClassification.model_validate_json(content)
100
100
  except Exception as ex:
101
- self.logger.error(f"Failed to parse response: {content}\n{ex}")
101
+ self.logger.warning(f"Failed to parse response: {content}\n{ex}")
102
102
  if '"category": "matter"' in content:
103
103
  result = FolderClassification(category="matter", reasoning="NA")
104
104
  else:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: folder-classifier
3
- Version: 0.3.4
3
+ Version: 0.3.5
4
4
  Summary: Deploy folder classifier API to a Ray cluster
5
5
  Author: Crispin Almodovar
6
6
  Author-email:
@@ -11,3 +11,7 @@ Requires-Python: >=3.12
11
11
  Description-Content-Type: text/markdown
12
12
 
13
13
  # Folder classifier API
14
+
15
+ Implements a REST API that calls an LLM-based classifier model hosted in Ray Serve.
16
+
17
+ See [Build.md](Build.md) for build and deployment instructions.
@@ -0,0 +1,10 @@
1
+ folder_classifier/__init__.py,sha256=k0YWZyUNe7myJiKeX0OaXtJ30_3EGE-vsZiAUbqa-3E,46
2
+ folder_classifier/app.py,sha256=5eaniDWZFBvyI9UN4vOCc6TQwgyptAUVhRVgfrjp2F4,1259
3
+ folder_classifier/classifier.py,sha256=wUoDr8GXajKQPF_zYMeJutpCfuIYVAfAUVjsePfjcIM,8591
4
+ folder_classifier/deploy.py,sha256=06UAxz40IaP28e_RRohJoFwPUrWTaMquGbDylI-oHWA,424
5
+ folder_classifier/dto.py,sha256=KRHEfXKlsIYkBySwHOlo4k5Z4J_F3eR3PeEdUAwj8lI,819
6
+ folder_classifier/util.py,sha256=-Ptxkba5UzmhLrqoiiKZS3G56_cuAMkWlTyHcqdJkg0,3160
7
+ folder_classifier-0.3.5.dist-info/METADATA,sha256=uFoRyzCLh4Gxzak0XktEohvbfHbM7g4Ow3RMoHcZxUs,546
8
+ folder_classifier-0.3.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ folder_classifier-0.3.5.dist-info/top_level.txt,sha256=36ugc9pEbNQ-mnzz4Ot2WVjY3t_LzAN6XOCjDFP4p4k,18
10
+ folder_classifier-0.3.5.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- folder_classifier/__init__.py,sha256=k0YWZyUNe7myJiKeX0OaXtJ30_3EGE-vsZiAUbqa-3E,46
2
- folder_classifier/app.py,sha256=_BUFSR7TJMoylrEwImDlln98C4BciPXoPqM6SQYkc1g,1255
3
- folder_classifier/classifier.py,sha256=XTqCfUAc4-vX1bS4LixPFNMgvNNFX2mLdWXekHHB-fQ,8285
4
- folder_classifier/deploy.py,sha256=06UAxz40IaP28e_RRohJoFwPUrWTaMquGbDylI-oHWA,424
5
- folder_classifier/dto.py,sha256=KRHEfXKlsIYkBySwHOlo4k5Z4J_F3eR3PeEdUAwj8lI,819
6
- folder_classifier/util.py,sha256=-Ptxkba5UzmhLrqoiiKZS3G56_cuAMkWlTyHcqdJkg0,3160
7
- folder_classifier-0.3.4.dist-info/METADATA,sha256=YwlhWYE_vMFYfBb0lib6kHUGZjUyH3soNKQiJq0jVmY,392
8
- folder_classifier-0.3.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- folder_classifier-0.3.4.dist-info/top_level.txt,sha256=36ugc9pEbNQ-mnzz4Ot2WVjY3t_LzAN6XOCjDFP4p4k,18
10
- folder_classifier-0.3.4.dist-info/RECORD,,