folder-classifier 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
folder_classifier/app.py CHANGED
@@ -1,28 +1,28 @@
1
- import logging
2
-
3
- from fastapi import FastAPI
4
- from ray import serve
5
-
6
- from folder_classifier.classifier import FolderClassifier
7
- from folder_classifier.dto import ModelConfig, FolderClassificationRequest, FolderClassificationResponse
8
-
9
- web_api = FastAPI(title=f"Folder Classifier API")
10
-
11
- @serve.deployment
12
- @serve.ingress(web_api)
13
- class FolderClassifierAPI:
14
- def __init__(self, model_config: ModelConfig):
15
- assert model_config, "model_config is required"
16
- assert model_config.app_name and model_config.deployment, "Invalid ModelConfig values"
17
- logging.basicConfig(level=logging.INFO)
18
- self.logger = logging.getLogger(__name__)
19
- self.logger.info(f"Initializing Folder Classifier model: {model_config}")
20
- model_handle = serve.get_deployment_handle(app_name=model_config.app_name, deployment_name=model_config.deployment)
21
- self.classifier = FolderClassifier(model_handle)
22
- self.logger.info(f"Successfully initialized Folder Classifier API")
23
-
24
- @web_api.post("/predict")
25
- async def predict(self, request: FolderClassificationRequest) -> FolderClassificationResponse:
26
- self.logger.info(f"Received request: {request}")
27
- category, reasoning = await self.classifier.predict(request)
28
- return FolderClassificationResponse(category=category, reasoning=reasoning)
1
+ import logging
2
+
3
+ from fastapi import FastAPI
4
+ from ray import serve
5
+
6
+ from folder_classifier.classifier import FolderClassifier
7
+ from folder_classifier.dto import ModelConfig, FolderClassificationRequest, FolderClassificationResponse
8
+
9
+ web_api = FastAPI(title=f"Folder Classifier API")
10
+
11
+ @serve.deployment
12
+ @serve.ingress(web_api)
13
+ class FolderClassifierAPI:
14
+ def __init__(self, model_config: ModelConfig):
15
+ assert model_config, "model_config is required"
16
+ assert model_config.app_name and model_config.deployment, "Invalid ModelConfig values"
17
+ logging.basicConfig(level=logging.INFO)
18
+ self.logger = logging.getLogger(__name__)
19
+ self.logger.info(f"Initializing Folder Classifier model: {model_config}")
20
+ model_handle = serve.get_deployment_handle(app_name=model_config.app_name, deployment_name=model_config.deployment)
21
+ self.classifier = FolderClassifier(model_handle)
22
+ self.logger.info(f"Successfully initialized Folder Classifier API")
23
+
24
+ @web_api.post("/predict")
25
+ async def predict(self, request: FolderClassificationRequest) -> FolderClassificationResponse:
26
+ self.logger.info(f"Received request: {request}")
27
+ category, reasoning = await self.classifier.predict(request)
28
+ return FolderClassificationResponse(category=category, reasoning=reasoning)
folder_classifier/classifier.py CHANGED
@@ -1,129 +1,133 @@
1
- import logging
2
- import os
3
- from typing import Tuple, Dict, Any
4
-
5
- from ray.serve.handle import DeploymentHandle
6
-
7
- from folder_classifier.dto import FolderClassificationRequest, FolderClassification
8
- from folder_classifier.util import build_folder, render_tree
9
-
10
-
11
- MODEL = os.getenv("MODEL", "Qwen3-4B-Instruct-2507-classifier-FP8")
12
-
13
- SYSTEM_PROMPT = """
14
- You are a strict text classifier.
15
- Output MUST be a single JSON object with exactly two keys: "category" and "reasoning".
16
- - "category" ∈ {"matter","other"} (lowercase).
17
- - "reasoning" 30 words.
18
- - Do not include any double-quote (") characters inside the value of "reasoning"; use single quotes ' instead.
19
- - No backticks, no code fences, no extra text. Return one JSON object only.
20
- If unsure, choose "other".
21
- """.strip()
22
-
23
- USER_PROMPT_TEMPLATE = """
24
- Classify the FOLDER TREE as "matter" or "other".
25
-
26
- Definitions
27
- - matter: The FOLDER TREE represents exactly one legal matter/container (one client/case/matter) AND includes at least one legal-work indicator.
28
- - other: Anything else, including:
29
- (a) the FOLDER TREE appears to contain multiple distinct matters (a container of matters), or
30
- (b) the ROOT NAME is a common subfolder/stage/type (e.g., "Correspondence", "Drafts", "Pleadings", "Court Documents", "Billing", "Evidence"), or
31
- (c) legal-work indicators are absent, or
32
- (d) there are zero files with extensions anywhere in the tree, or
33
- (e) contents are exclusively non-legal domains (finance/accounting/IT/admin) with no legal-work indicators, or
34
- (f) the ROOT NAME is generic/system-generated and not matter-specific (see R9).
35
-
36
- Decision Rules (apply in order; case-insensitive)
37
- RC_container: Any indication the FOLDER TREE holds more than one distinct matter (e.g., multiple top-level matter-like subfolders; repeated separate matter numbers/clients) → category=other.
38
- R0_subfolder: ROOT NAME equals a common matter subfolder/stage/type (e.g., Correspondence/Emails, File Notes/Attendance Notes, Searches/Certificates, Court Documents/Pleadings/Evidence/Disclosure/Discovery, Drafts/Final/Signed/Executed, Billing/Invoices/Time Records) → category=other.
39
- R7_files_present: Must contain ≥1 file with an extension (e.g., .pdf, .docx) anywhere in the tree; if none → category=other.
40
- R8_nonlegal_only: If the tree shows strong non-legal domain signals (finance/accounting/IT/admin) and NO legal-work indicators (R2–R4 or R6), classify as other.
41
- Finance/Accounting examples: BAS/Business Activity Statement, IAS, GST, Tax Return, PAYG, Payroll, Timesheets, Payslips, Superannuation, Xero/MYOB/QuickBooks exports, General Ledger, Trial Balance, Journals, Bank Statements.
42
- IT/Systems examples: Backups, Logs, Source Code, Git, DevOps, Server/Network/VPN, Mailboxes, Google Workspace/Microsoft 365 admin.
43
- Admin-only examples: generic receipts, vendor invoices, expense folders without legal context.
44
- Note: “Billing/Invoices/Time Records” inside a matter is a typical legal subfolder; R8 applies only when legal indicators are entirely absent.
45
- R9_generic_rootname: If the ROOT NAME is generic/system-generated and not matter-specific category=other (even if legal documents appear underneath).
46
- Examples: "CORTO Generated", "Generated Files", "Exports", "Uploads", "Scans", "Shared", "Dropbox Shared", "Google Drive", "OneDrive", "SharePoint", "Archive", "Backup", "Temp", "Incoming", "Outbox", "Bulk Import".
47
- Heuristic: words like generated/export/import/sync/shared/archive/backup/temp/uploads/downloads indicate a system or generic container, not a single matter.
48
- R1_rootname: ROOT NAME resembles a single matter/container (matter/file/case number; client/surname/company; or a combination such as “12345 Smith” or “Smith – Contract Dispute”).
49
- R2_initial_docs: Early-stage matter docs (cost agreement/disclosure, retainer/engagement, intake/onboarding).
50
- R3_legal_docs: Legal document types (agreement, contract, deed, will, affidavit, statement, advice, brief, pleadings, court forms, subpoena, orders, judgment, undertaking, notice of appeal, docket/case forms).
51
- R4_legal_subfolders: Typical legal subfolders (correspondence/emails, file notes/attendance notes, searches/certificates, court documents/evidence/disclosure/discovery, drafts/final/signed/executed, billing/invoices/time records).
52
- R5_support_filename_patterns: Supportive only (not decisive): versioning (v1/v2/v3), “final”, “executed”, “signed”, eight-digit dates (YYYYMMDD/DDMMYYYY).
53
- R6_jurisdiction: Court/jurisdiction/case references (generic court acronyms, registry references, docket patterns).
54
-
55
- Decision
56
- - If RC_container → category=other (stop).
57
- - Else if R0_subfolder → category=other (stop).
58
- - Else if NOT R7_files_present → category=other (stop).
59
- - Else if R8_nonlegal_only → category=other (stop).
60
- - Else if R9_generic_rootname → category=other (stop).
61
- - Else if R1_rootname AND any of {R2_initial_docs, R3_legal_docs, R4_legal_subfolders, R6_jurisdiction} AND no multi-matter signal → category=matter.
62
- (R5_support_filename_patterns cannot be used alone to justify "matter"; it is supportive only.)
63
- - Else → category=other.
64
-
65
- Normalization
66
- - “File with extension”: name containing a period followed by a 1–5 char alphanumeric extension (e.g., .pdf, .docx). Ignore leading/trailing periods.
67
- - Treat hyphens/underscores as separators. Ignore file extensions for semantic matching beyond the presence test. Tolerate minor typos.
68
-
69
- Output format (JSON only; no prose before/after):
70
- {"category": "<matter|other>", "reasoning": "≤30 words citing R# and 1–2 evidence tokens>"}
71
-
72
- FOLDER TREE:
73
- {folder_tree}
74
- """.strip()
75
-
76
-
77
- FOLDER_CLASSIFICATION_SCHEMA = FolderClassification.model_json_schema()
78
-
79
-
80
- class FolderClassifier:
81
- def __init__(self, model_handle: DeploymentHandle):
82
- self.logger = logging.getLogger(__name__)
83
- self.model_handler = model_handle
84
- self.logger.info(f"Successfully initialized FolderClassifier")
85
-
86
- async def predict(self, request: FolderClassificationRequest) -> Tuple[str, str]:
87
- try:
88
- chat_completion_request = self._to_chat_completion_request(request)
89
- response = await self.model_handler.create_chat_completion.remote(chat_completion_request)
90
- content = response.choices[0].message.content
91
- result = FolderClassification.model_validate_json(content)
92
- except Exception as ex:
93
- self.logger.error(f"Failed to parse response: {content}\n{ex}")
94
- if '"category": "matter"' in content:
95
- result = FolderClassification(category="matter", reasoning="NA")
96
- else:
97
- result = FolderClassification(category="other", reasoning="NA")
98
-
99
- return result.category, result.reasoning
100
-
101
- @staticmethod
102
- def _to_chat_completion_request(request: FolderClassificationRequest) -> Dict[str, Any]:
103
- input_paths = request.items
104
- folder = build_folder(input_paths)
105
- folder_tree = render_tree(folder)
106
- chat_completion_request = {
107
- "model": MODEL,
108
- "messages": [
109
- {"role": "system", "content": SYSTEM_PROMPT},
110
- {"role": "user", "content": USER_PROMPT_TEMPLATE.replace("{folder_tree}", folder_tree)}
111
- ],
112
- "max_tokens": 1024,
113
- "temperature": 0.7,
114
- "top_p": 0.8,
115
- "response_format": {
116
- "type": "json_schema",
117
- "json_schema": {
118
- "name": "FolderClassification",
119
- "schema": FOLDER_CLASSIFICATION_SCHEMA,
120
- "strict": True,
121
- },
122
- }
123
- }
124
- return chat_completion_request
125
-
126
-
127
-
128
-
129
-
1
+ import logging
2
+ import os
3
+ from typing import Tuple, Dict, Any
4
+
5
+ from ray.serve.handle import DeploymentHandle
6
+
7
+ from folder_classifier.dto import FolderClassificationRequest, FolderClassification
8
+ from folder_classifier.util import build_folder, render_tree
9
+ from openai import AsyncOpenAI
10
+
11
+
12
+ OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://internal-multimodal-services.dev.cortoaws.com/serve/qwen3-4b-classifier/v1")
13
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "DUMMY_KEY")
14
+ MODEL = os.getenv("MODEL", "Qwen3-4B-Instruct-2507-classifier-FP8")
15
+
16
+ SYSTEM_PROMPT = """
17
+ You are a strict text classifier.
18
+ Output MUST be a single JSON object with exactly two keys: "category" and "reasoning".
19
+ - "category" ∈ {"matter","other"} (lowercase).
20
+ - "reasoning" 30 words.
21
+ - Do not include any double-quote (") characters inside the value of "reasoning"; use single quotes ' instead.
22
+ - No backticks, no code fences, no extra text. Return one JSON object only.
23
+ If unsure, choose "other".
24
+ """.strip()
25
+
26
+ USER_PROMPT_TEMPLATE = """
27
+ Classify the FOLDER TREE as "matter" or "other".
28
+
29
+ Definitions
30
+ - matter: The FOLDER TREE represents exactly one legal matter/container (one client/case/matter) AND includes at least one legal-work indicator.
31
+ - other: Anything else, including:
32
+ (a) the FOLDER TREE appears to contain multiple distinct matters (a container of matters), or
33
+ (b) the ROOT NAME is a common subfolder/stage/type (e.g., "Correspondence", "Drafts", "Pleadings", "Court Documents", "Billing", "Evidence"), or
34
+ (c) legal-work indicators are absent, or
35
+ (d) there are zero files with extensions anywhere in the tree, or
36
+ (e) contents are exclusively non-legal domains (finance/accounting/IT/admin) with no legal-work indicators, or
37
+ (f) the ROOT NAME is generic/system-generated and not matter-specific (see R9).
38
+
39
+ Decision Rules (apply in order; case-insensitive)
40
+ RC_container: Any indication the FOLDER TREE holds more than one distinct matter (e.g., multiple top-level matter-like subfolders; repeated separate matter numbers/clients) → category=other.
41
+ R0_subfolder: ROOT NAME equals a common matter subfolder/stage/type (e.g., Correspondence/Emails, File Notes/Attendance Notes, Searches/Certificates, Court Documents/Pleadings/Evidence/Disclosure/Discovery, Drafts/Final/Signed/Executed, Billing/Invoices/Time Records) → category=other.
42
+ R7_files_present: Must contain ≥1 file with an extension (e.g., .pdf, .docx) anywhere in the tree; if none → category=other.
43
+ R8_nonlegal_only: If the tree shows strong non-legal domain signals (finance/accounting/IT/admin) and NO legal-work indicators (R2–R4 or R6), classify as other.
44
+ Finance/Accounting examples: BAS/Business Activity Statement, IAS, GST, Tax Return, PAYG, Payroll, Timesheets, Payslips, Superannuation, Xero/MYOB/QuickBooks exports, General Ledger, Trial Balance, Journals, Bank Statements.
45
+ IT/Systems examples: Backups, Logs, Source Code, Git, DevOps, Server/Network/VPN, Mailboxes, Google Workspace/Microsoft 365 admin.
46
+ Admin-only examples: generic receipts, vendor invoices, expense folders without legal context.
47
+ Note: “Billing/Invoices/Time Records” inside a matter is a typical legal subfolder; R8 applies only when legal indicators are entirely absent.
48
+ R9_generic_rootname: If the ROOT NAME is generic/system-generated and not matter-specific category=other (even if legal documents appear underneath).
49
+ • Examples: "CORTO Generated", "Generated Files", "Exports", "Uploads", "Scans", "Shared", "Dropbox Shared", "Google Drive", "OneDrive", "SharePoint", "Archive", "Backup", "Temp", "Incoming", "Outbox", "Bulk Import".
50
+ • Heuristic: words like generated/export/import/sync/shared/archive/backup/temp/uploads/downloads indicate a system or generic container, not a single matter.
51
+ R1_rootname: ROOT NAME resembles a single matter/container (matter/file/case number; client/surname/company; or a combination such as “12345 Smith” or “Smith – Contract Dispute”).
52
+ R2_initial_docs: Early-stage matter docs (cost agreement/disclosure, retainer/engagement, intake/onboarding).
53
+ R3_legal_docs: Legal document types (agreement, contract, deed, will, affidavit, statement, advice, brief, pleadings, court forms, subpoena, orders, judgment, undertaking, notice of appeal, docket/case forms).
54
+ R4_legal_subfolders: Typical legal subfolders (correspondence/emails, file notes/attendance notes, searches/certificates, court documents/evidence/disclosure/discovery, drafts/final/signed/executed, billing/invoices/time records).
55
+ R5_support_filename_patterns: Supportive only (not decisive): versioning (v1/v2/v3), “final”, “executed”, “signed”, eight-digit dates (YYYYMMDD/DDMMYYYY).
56
+ R6_jurisdiction: Court/jurisdiction/case references (generic court acronyms, registry references, docket patterns).
57
+
58
+ Decision
59
+ - If RC_container → category=other (stop).
60
+ - Else if R0_subfolder → category=other (stop).
61
+ - Else if NOT R7_files_present → category=other (stop).
62
+ - Else if R8_nonlegal_only → category=other (stop).
63
+ - Else if R9_generic_rootname → category=other (stop).
64
+ - Else if R1_rootname AND any of {R2_initial_docs, R3_legal_docs, R4_legal_subfolders, R6_jurisdiction} AND no multi-matter signal → category=matter.
65
+ (R5_support_filename_patterns cannot be used alone to justify "matter"; it is supportive only.)
66
+ - Else → category=other.
67
+
68
+ Normalization
69
+ - “File with extension”: name containing a period followed by a 1–5 char alphanumeric extension (e.g., .pdf, .docx). Ignore leading/trailing periods.
70
+ - Treat hyphens/underscores as separators. Ignore file extensions for semantic matching beyond the presence test. Tolerate minor typos.
71
+
72
+ Output format (JSON only; no prose before/after):
73
+ {"category": "<matter|other>", "reasoning": "≤30 words citing R# and 1–2 evidence tokens>"}
74
+
75
+ FOLDER TREE:
76
+ {folder_tree}
77
+ """.strip()
78
+
79
+
80
+ FOLDER_CLASSIFICATION_SCHEMA = FolderClassification.model_json_schema()
81
+
82
+
83
+ class FolderClassifier:
84
+ def __init__(self, model_handle: DeploymentHandle):
85
+ self.logger = logging.getLogger(__name__)
86
+ self.model_handle = model_handle
87
+ self.llm = AsyncOpenAI(base_url=OPENAI_BASE_URL, api_key=OPENAI_API_KEY)
88
+ self.logger.info(f"Successfully initialized FolderClassifier")
89
+
90
+ async def predict(self, request: FolderClassificationRequest) -> Tuple[str, str]:
91
+ content = ""
92
+ try:
93
+ chat_completion_request = self._to_chat_completion_request(request)
94
+ response = await self.llm.chat.completions.create(**chat_completion_request)
95
+ content = response.choices[0].message.content
96
+ result = FolderClassification.model_validate_json(content)
97
+ except Exception as ex:
98
+ self.logger.error(f"Failed to parse response: {content}\n{ex}")
99
+ if '"category": "matter"' in content:
100
+ result = FolderClassification(category="matter", reasoning="NA")
101
+ else:
102
+ result = FolderClassification(category="other", reasoning="NA")
103
+ return result.category, result.reasoning
104
+
105
+ @staticmethod
106
+ def _to_chat_completion_request(request: FolderClassificationRequest) -> Dict[str, Any]:
107
+ input_paths = request.items
108
+ folder = build_folder(input_paths)
109
+ folder_tree = render_tree(folder)
110
+ chat_completion_request = {
111
+ "model": MODEL,
112
+ "messages": [
113
+ {"role": "system", "content": SYSTEM_PROMPT},
114
+ {"role": "user", "content": USER_PROMPT_TEMPLATE.replace("{folder_tree}", folder_tree)}
115
+ ],
116
+ "max_tokens": 1024,
117
+ "temperature": 0.7,
118
+ "top_p": 0.8,
119
+ "response_format": {
120
+ "type": "json_schema",
121
+ "json_schema": {
122
+ "name": "FolderClassification",
123
+ "schema": FOLDER_CLASSIFICATION_SCHEMA,
124
+ "strict": True,
125
+ },
126
+ }
127
+ }
128
+ return chat_completion_request
129
+
130
+
131
+
132
+
133
+
folder_classifier/deploy.py CHANGED
@@ -1,12 +1,12 @@
1
- from ray.serve import Application
2
-
3
- from folder_classifier.app import FolderClassifierAPI
4
- from folder_classifier.dto import AppConfig
5
-
6
-
7
- def build_app(args: AppConfig) -> Application:
8
- assert args and args.model, "AppConfig model is required"
9
- assert args.model.app_name and args.model.deployment, "Model's app_name and deployment are required"
10
-
11
- app = FolderClassifierAPI.bind(args.model)
12
- return app
1
+ from ray.serve import Application
2
+
3
+ from folder_classifier.app import FolderClassifierAPI
4
+ from folder_classifier.dto import AppConfig
5
+
6
+
7
+ def build_app(args: AppConfig) -> Application:
8
+ assert args and args.model, "AppConfig model is required"
9
+ assert args.model.app_name and args.model.deployment, "Model's app_name and deployment are required"
10
+
11
+ app = FolderClassifierAPI.bind(args.model)
12
+ return app
folder_classifier/dto.py CHANGED
@@ -1,37 +1,37 @@
1
- from typing import List, Union, Literal, Optional
2
- from pydantic import BaseModel, Field, ConfigDict
3
-
4
-
5
- class ModelConfig(BaseModel):
6
- app_name: str
7
- deployment: str
8
-
9
-
10
- class AppConfig(BaseModel):
11
- model: ModelConfig
12
-
13
-
14
- class File(BaseModel):
15
- name: str
16
- type: Literal["file"]
17
-
18
-
19
- class Folder(BaseModel):
20
- name: str
21
- type: Literal["folder"]
22
- items: List[Union[File, 'Folder']] = Field(default_factory=list)
23
-
24
-
25
- class FolderClassificationRequest(BaseModel):
26
- items: List[str]
27
-
28
-
29
- class FolderClassificationResponse(BaseModel):
30
- category: Literal["matter", "other"]
31
- reasoning: Optional[str] = None
32
-
33
-
34
- class FolderClassification(BaseModel):
35
- category: Literal["matter", "other"]
36
- reasoning: str
1
+ from typing import List, Union, Literal, Optional
2
+ from pydantic import BaseModel, Field, ConfigDict
3
+
4
+
5
+ class ModelConfig(BaseModel):
6
+ app_name: str
7
+ deployment: str
8
+
9
+
10
+ class AppConfig(BaseModel):
11
+ model: ModelConfig
12
+
13
+
14
+ class File(BaseModel):
15
+ name: str
16
+ type: Literal["file"]
17
+
18
+
19
+ class Folder(BaseModel):
20
+ name: str
21
+ type: Literal["folder"]
22
+ items: List[Union[File, 'Folder']] = Field(default_factory=list)
23
+
24
+
25
+ class FolderClassificationRequest(BaseModel):
26
+ items: List[str]
27
+
28
+
29
+ class FolderClassificationResponse(BaseModel):
30
+ category: Literal["matter", "other"]
31
+ reasoning: Optional[str] = None
32
+
33
+
34
+ class FolderClassification(BaseModel):
35
+ category: Literal["matter", "other"]
36
+ reasoning: str
37
37
  model_config = ConfigDict(extra="forbid")
folder_classifier/util.py CHANGED
@@ -1,95 +1,95 @@
1
- from typing import List, Union
2
-
3
- from folder_classifier.dto import Folder, File
4
-
5
-
6
- def build_folder(paths: List[str]) -> Folder:
7
- """
8
- Create a Folder tree from a list of file paths;
9
- The file paths are delimited by "/" - leaf segments are assumed to be files
10
- """
11
- if not paths:
12
- raise ValueError("No paths provided")
13
-
14
- # Get all directory prefixes
15
- prefix_set = set()
16
- for p in paths:
17
- parts = p.split('/')
18
- for i in range(1, len(parts)):
19
- prefix_set.add('/'.join(parts[:i]))
20
-
21
- # Sort by depth so parents are created before children
22
- sorted_paths = sorted(paths, key=lambda x: x.count('/'))
23
-
24
- # Create root folder
25
- root_name = sorted_paths[0].split('/')[0]
26
- root = Folder(name=root_name, type="folder", items=[])
27
-
28
- # Build the tree
29
- for p in sorted_paths:
30
- parts = p.split('/')
31
- current = root
32
- for idx, part in enumerate(parts[1:], start=1):
33
- full_path = '/'.join(parts[:idx+1])
34
- is_last = idx == len(parts) - 1
35
-
36
- # existing item
37
- existing = next((item for item in current.items if item.name == part), None)
38
- if existing:
39
- if isinstance(existing, Folder):
40
- current = existing
41
- continue
42
-
43
- # Determine type for new item
44
- if is_last and full_path not in prefix_set:
45
- new_item = File(name=part, type="file")
46
- else:
47
- new_item = Folder(name=part, type="folder", items=[])
48
-
49
- current.items.append(new_item)
50
- if isinstance(new_item, Folder):
51
- current = new_item
52
-
53
- return root
54
-
55
-
56
- def render_tree(folder: Folder) -> str:
57
- """
58
- Render Folder tree using ASCII tree characters (├──, └──, │).
59
- """
60
- lines: List[str] = []
61
-
62
- def recurse(node: Union[Folder, File], prefix: str, is_last: bool):
63
- connector = "└── " if is_last else "├── "
64
- lines.append(f"{prefix}{connector}{node.name}")
65
- if isinstance(node, Folder):
66
- child_prefix = prefix + (" " if is_last else "│ ")
67
- for idx, child in enumerate(node.items):
68
- recurse(child, child_prefix, idx == len(node.items) - 1)
69
-
70
- # root
71
- lines.append(folder.name)
72
- for idx, child in enumerate(folder.items):
73
- recurse(child, "", idx == len(folder.items) - 1)
74
-
75
- return "\n".join(lines)
76
-
77
-
78
- def flatten_folder(folder: Folder, parent_path: str = "") -> List[str]:
79
- """
80
- Traverses a Folder and returns a list of file paths.
81
- Each path is constructed by joining folder and file names with '/'.
82
- """
83
- paths: List[str] = []
84
- # Build the path for the current folder
85
- current_path = f"{parent_path}/{folder.name}" if parent_path else folder.name
86
-
87
- for item in folder.items:
88
- if item.type == "file":
89
- paths.append(f"{current_path}/{item.name}")
90
- else:
91
- # Recursively flatten subfolders
92
- paths.extend(flatten_folder(item, current_path))
93
- return paths
94
-
95
-
1
+ from typing import List, Union
2
+
3
+ from folder_classifier.dto import Folder, File
4
+
5
+
6
+ def build_folder(paths: List[str]) -> Folder:
7
+ """
8
+ Create a Folder tree from a list of file paths;
9
+ The file paths are delimited by "/" - leaf segments are assumed to be files
10
+ """
11
+ if not paths:
12
+ raise ValueError("No paths provided")
13
+
14
+ # Get all directory prefixes
15
+ prefix_set = set()
16
+ for p in paths:
17
+ parts = p.split('/')
18
+ for i in range(1, len(parts)):
19
+ prefix_set.add('/'.join(parts[:i]))
20
+
21
+ # Sort by depth so parents are created before children
22
+ sorted_paths = sorted(paths, key=lambda x: x.count('/'))
23
+
24
+ # Create root folder
25
+ root_name = sorted_paths[0].split('/')[0]
26
+ root = Folder(name=root_name, type="folder", items=[])
27
+
28
+ # Build the tree
29
+ for p in sorted_paths:
30
+ parts = p.split('/')
31
+ current = root
32
+ for idx, part in enumerate(parts[1:], start=1):
33
+ full_path = '/'.join(parts[:idx+1])
34
+ is_last = idx == len(parts) - 1
35
+
36
+ # existing item
37
+ existing = next((item for item in current.items if item.name == part), None)
38
+ if existing:
39
+ if isinstance(existing, Folder):
40
+ current = existing
41
+ continue
42
+
43
+ # Determine type for new item
44
+ if is_last and full_path not in prefix_set:
45
+ new_item = File(name=part, type="file")
46
+ else:
47
+ new_item = Folder(name=part, type="folder", items=[])
48
+
49
+ current.items.append(new_item)
50
+ if isinstance(new_item, Folder):
51
+ current = new_item
52
+
53
+ return root
54
+
55
+
56
+ def render_tree(folder: Folder) -> str:
57
+ """
58
+ Render Folder tree using ASCII tree characters (├──, └──, │).
59
+ """
60
+ lines: List[str] = []
61
+
62
+ def recurse(node: Union[Folder, File], prefix: str, is_last: bool):
63
+ connector = "└── " if is_last else "├── "
64
+ lines.append(f"{prefix}{connector}{node.name}")
65
+ if isinstance(node, Folder):
66
+ child_prefix = prefix + (" " if is_last else "│ ")
67
+ for idx, child in enumerate(node.items):
68
+ recurse(child, child_prefix, idx == len(node.items) - 1)
69
+
70
+ # root
71
+ lines.append(folder.name)
72
+ for idx, child in enumerate(folder.items):
73
+ recurse(child, "", idx == len(folder.items) - 1)
74
+
75
+ return "\n".join(lines)
76
+
77
+
78
+ def flatten_folder(folder: Folder, parent_path: str = "") -> List[str]:
79
+ """
80
+ Traverses a Folder and returns a list of file paths.
81
+ Each path is constructed by joining folder and file names with '/'.
82
+ """
83
+ paths: List[str] = []
84
+ # Build the path for the current folder
85
+ current_path = f"{parent_path}/{folder.name}" if parent_path else folder.name
86
+
87
+ for item in folder.items:
88
+ if item.type == "file":
89
+ paths.append(f"{current_path}/{item.name}")
90
+ else:
91
+ # Recursively flatten subfolders
92
+ paths.extend(flatten_folder(item, current_path))
93
+ return paths
94
+
95
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: folder-classifier
3
- Version: 0.3.1
3
+ Version: 0.3.3
4
4
  Summary: Deploy folder classifier API to a Ray cluster
5
5
  Author: Crispin Almodovar
6
6
  Author-email:
@@ -0,0 +1,10 @@
1
+ folder_classifier/__init__.py,sha256=k0YWZyUNe7myJiKeX0OaXtJ30_3EGE-vsZiAUbqa-3E,46
2
+ folder_classifier/app.py,sha256=Xp3qkxQDWhFJKINMSMiDPtn1EEJrGm7XdTJAPZz6I4w,1358
3
+ folder_classifier/classifier.py,sha256=mBt6QvWIWiKdfwqRXog_p0nRqEnZ6mKwgGf8VADx930,8146
4
+ folder_classifier/deploy.py,sha256=06UAxz40IaP28e_RRohJoFwPUrWTaMquGbDylI-oHWA,424
5
+ folder_classifier/dto.py,sha256=bVai09FuPXktOo5Y_9dzXXj0ZvriDwZOho0ZYmWvc7E,803
6
+ folder_classifier/util.py,sha256=-Ptxkba5UzmhLrqoiiKZS3G56_cuAMkWlTyHcqdJkg0,3160
7
+ folder_classifier-0.3.3.dist-info/METADATA,sha256=Tj220ru3uyno5el6TPyEJ12ew8pDzzdA-hIHKkD7Y8Y,392
8
+ folder_classifier-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
+ folder_classifier-0.3.3.dist-info/top_level.txt,sha256=36ugc9pEbNQ-mnzz4Ot2WVjY3t_LzAN6XOCjDFP4p4k,18
10
+ folder_classifier-0.3.3.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- folder_classifier/__init__.py,sha256=k0YWZyUNe7myJiKeX0OaXtJ30_3EGE-vsZiAUbqa-3E,46
2
- folder_classifier/app.py,sha256=doaXkKnpe-LNztkMkuROutjId7XPS_t9G7m9dSXt-ow,1330
3
- folder_classifier/classifier.py,sha256=3V4M-x5K7vASStsnPlrW2M9tA6uWsKA4L1rCMHqCQ_M,7705
4
- folder_classifier/deploy.py,sha256=UQTbQjR_JX92Xo8L5EbK4nPg_VJmwWDmzMQSjP3-7iQ,412
5
- folder_classifier/dto.py,sha256=pCRErTIflx8zMe7dD_GcnaBagPYpuZy2vIZCAbb0i0Q,767
6
- folder_classifier/util.py,sha256=4zk52bPVyoVU0FfDTmeK5JdM4y0IbnuaPS1PWc9txa4,3065
7
- folder_classifier-0.3.1.dist-info/METADATA,sha256=x4Dv4CCUXoo0GAL5DX3I3FPrHvTWYKowY1UrkMGD2Gs,392
8
- folder_classifier-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
9
- folder_classifier-0.3.1.dist-info/top_level.txt,sha256=36ugc9pEbNQ-mnzz4Ot2WVjY3t_LzAN6XOCjDFP4p4k,18
10
- folder_classifier-0.3.1.dist-info/RECORD,,