folder-classifier 0.3.7__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/PKG-INFO +1 -1
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/folder_classifier/app.py +4 -2
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/folder_classifier/classifier.py +44 -16
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/folder_classifier/deploy.py +5 -0
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/folder_classifier/dto.py +8 -1
- folder_classifier-0.4.0/folder_classifier/util.py +103 -0
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/folder_classifier.egg-info/PKG-INFO +1 -1
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/setup.cfg +1 -1
- folder_classifier-0.3.7/folder_classifier/util.py +0 -95
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/README.md +0 -0
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/folder_classifier/__init__.py +0 -0
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/folder_classifier.egg-info/SOURCES.txt +0 -0
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/folder_classifier.egg-info/dependency_links.txt +0 -0
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/folder_classifier.egg-info/top_level.txt +0 -0
- {folder_classifier-0.3.7 → folder_classifier-0.4.0}/pyproject.toml +0 -0
@@ -16,11 +16,13 @@ class FolderClassifierAPI:
|
|
16
16
|
assert model_config.app_name and model_config.deployment, "Invalid ModelConfig values"
|
17
17
|
logging.basicConfig(level=logging.INFO)
|
18
18
|
self.logger = logging.getLogger(__name__)
|
19
|
-
self.classifier = FolderClassifier(app_name=model_config.app_name, deployment=model_config.deployment, model=model_config.model
|
19
|
+
self.classifier = FolderClassifier(app_name=model_config.app_name, deployment=model_config.deployment, model=model_config.model,
|
20
|
+
fallback_config = model_config.fallback)
|
20
21
|
self.logger.info(f"Successfully initialized Folder Classifier API using config: {model_config}")
|
21
22
|
|
22
23
|
@web_api.post("/predict")
|
23
24
|
async def predict(self, request: FolderClassificationRequest) -> FolderClassificationResponse:
|
24
|
-
self.logger.info(f"Received
|
25
|
+
self.logger.info(f"Received new request")
|
25
26
|
category, reasoning = await self.classifier.predict(request)
|
27
|
+
self.logger.info(f"Request with items: {request.items} classified as '{category}' with reasoning: '{reasoning}'")
|
26
28
|
return FolderClassificationResponse(category=category, reasoning=reasoning)
|
@@ -1,31 +1,37 @@
|
|
1
1
|
import json
|
2
2
|
import logging
|
3
|
-
from typing import Tuple, Dict, Any
|
3
|
+
from typing import Tuple, Dict, Any, Optional
|
4
|
+
import os
|
4
5
|
|
6
|
+
from openai import AsyncOpenAI
|
5
7
|
from ray import serve
|
6
8
|
|
7
|
-
from folder_classifier.dto import FolderClassificationRequest, FolderClassification
|
9
|
+
from folder_classifier.dto import FolderClassificationRequest, FolderClassification, FallbackConfig
|
8
10
|
from folder_classifier.util import build_folder, render_tree
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
SYSTEM_PROMPT = """
|
16
|
+
You are an expert paralegal. Using only the evidence provided, decide if a root folder and its contents represent a single legal matter for a client.
|
17
|
+
Follow only the decision rules included in the user message. Respond with exactly one minified JSON object with exactly two keys: "category" and "reasoning".
|
12
18
|
- "category": either "matter" or "other" (lowercase).
|
13
19
|
- "reasoning": 1–2 short explanation referencing the key rule(s) that decided it.
|
14
20
|
No markdown (no backticks or code blocks) or any extra text outside the JSON. No chain-of-thought explanations or extra keys. If uncertain, choose "other".
|
15
21
|
""".strip()
|
16
22
|
|
17
23
|
USER_PROMPT_TEMPLATE = r"""
|
18
|
-
Task: Classify the folder
|
24
|
+
Task: Classify the root folder as 'matter' or 'other'.
|
19
25
|
|
20
26
|
Decision rules (apply in order):
|
21
27
|
1) If there are no files with extension anywhere in the tree , classify as 'other'.
|
22
28
|
2) If the root folder appears to be a container of multiple matters , classify as 'other'.
|
23
|
-
3)
|
24
|
-
|
25
|
-
|
29
|
+
3) If the root folder is or ends with a common subfolder name or descriptor found inside legal matters (e.g.,"Email", "Summons" "Emails", "Documents", "Correspondence", "Drafts", "Pleadings", "Court Documents", "Billing", or similar descriptive folder types), classify as 'other' even if it contains legal documents.
|
30
|
+
4) If the root folder is a subfolder of a matter, classify as other
|
31
|
+
5) If the root folder name matches any Matter Folder Naming Pattern and there is at least one file with an extension anywhere in the tree (including subfolders), and there is at least one file, subfolder, or filename that directly and unambiguously references a legal, client-matter, or professional context—for example, a legal document type, an initial or core legal document, clear legal terminology, a jurisdiction/court reference, the name of a law firm or legal/financial professional, or an activity specific to legal work—classify as 'matter'.
|
26
32
|
Do not classify as 'matter' if the folder only contains general business documents (e.g., invoices, estimates, generic correspondence) and there are no strong indicators of legal, client, or matter-related content as defined above
|
27
|
-
|
28
|
-
|
33
|
+
6) If the root folder name very compellingly looks like a matter folder e.g (11206 - AcmeX Pty v Acme Corp), classify as 'matter' even if the documents are not initial/core/legal documents.
|
34
|
+
7) If none of the above apply, classify as 'other'.
|
29
35
|
|
30
36
|
Matter Folder Naming Patterns (case-insensitive; separators like space, hyphen, underscore are fine):
|
31
37
|
• Matter number alone or combined with a client/surname/company (e.g., "12345", "12345 Smith", "Smith - Contract Dispute").
|
@@ -69,7 +75,8 @@ Definitions/assumptions:
|
|
69
75
|
|
70
76
|
Output format (JSON only, no extra text):
|
71
77
|
{"category": "<matter|other>", "reasoning": "<1–2 short explanation referencing the key rule(s) that decided it>"}
|
72
|
-
|
78
|
+
ROOT FOLDER:
|
79
|
+
{root_folder}
|
73
80
|
FOLDER TREE:
|
74
81
|
{folder_tree}
|
75
82
|
""".strip()
|
@@ -79,17 +86,24 @@ FOLDER_CLASSIFICATION_SCHEMA = FolderClassification.model_json_schema()
|
|
79
86
|
|
80
87
|
|
81
88
|
class FolderClassifier:
|
82
|
-
def __init__(self, app_name: str, deployment: str, model: str):
|
89
|
+
def __init__(self, app_name: str, deployment: str, model: str, fallback_config: Optional[FallbackConfig] = None):
|
83
90
|
self.logger = logging.getLogger(__name__)
|
84
91
|
self.model_handle = serve.get_deployment_handle(app_name=app_name, deployment_name=deployment)
|
85
92
|
self.model = model
|
86
|
-
self.
|
93
|
+
self.fallback_config = fallback_config
|
94
|
+
self.openai_client = AsyncOpenAI(base_url=self.fallback_config.openai_base_url, api_key=self.fallback_config.openai_api_key) \
|
95
|
+
if self.fallback_config else None
|
96
|
+
|
97
|
+
msg = f"Successfully initialized Folder Classifier with remote Ray model: {self.model}"
|
98
|
+
if self.fallback_config:
|
99
|
+
msg += f" and fallback - URL: {self.fallback_config.openai_base_url}; model: {self.fallback_config.model}"
|
100
|
+
self.logger.info(msg)
|
87
101
|
|
88
102
|
async def predict(self, request: FolderClassificationRequest) -> Tuple[str, str]:
|
89
103
|
content = ""
|
90
104
|
try:
|
91
105
|
chat_completion_request = self._to_chat_completion_request(request)
|
92
|
-
response = await self.
|
106
|
+
response = await self.run_chat_completion(chat_completion_request)
|
93
107
|
response_dict = json.loads(response.body)
|
94
108
|
content = response_dict["choices"][0]["message"]["content"]
|
95
109
|
result = FolderClassification.model_validate_json(content)
|
@@ -101,18 +115,32 @@ class FolderClassifier:
|
|
101
115
|
result = FolderClassification(category="other", reasoning="NA")
|
102
116
|
return result.category, result.reasoning
|
103
117
|
|
118
|
+
async def run_chat_completion(self, chat_completion_request: dict[str, Any]) -> Any:
|
119
|
+
response = None
|
120
|
+
try:
|
121
|
+
response = await self.model_handle.create_chat_completion_internal.remote(chat_completion_request)
|
122
|
+
except Exception as ex:
|
123
|
+
self.logger.warning(f"Failed to invoke primary model {chat_completion_request['model']}. {ex}")
|
124
|
+
if self.fallback_config:
|
125
|
+
self.logger.info(f"Invoking fallback OpenAI model: {self.fallback_config.model}")
|
126
|
+
response = await self.openai_client.chat.completions.create(**chat_completion_request)
|
127
|
+
return response
|
128
|
+
|
104
129
|
def _to_chat_completion_request(self, request: FolderClassificationRequest) -> Dict[str, Any]:
|
105
130
|
input_paths = request.items
|
106
131
|
folder = build_folder(input_paths)
|
132
|
+
root_folder = folder.name
|
107
133
|
folder_tree = render_tree(folder)
|
108
134
|
chat_completion_request = {
|
109
135
|
"model": self.model,
|
110
136
|
"messages": [
|
111
137
|
{"role": "system", "content": SYSTEM_PROMPT},
|
112
|
-
{"role": "user", "content": USER_PROMPT_TEMPLATE.
|
138
|
+
{"role": "user", "content": USER_PROMPT_TEMPLATE.
|
139
|
+
replace("{root_folder}", root_folder).
|
140
|
+
replace("{folder_tree}", folder_tree)}
|
113
141
|
],
|
114
142
|
"max_tokens": 1024,
|
115
|
-
"temperature": 0.
|
143
|
+
"temperature": 0.2,
|
116
144
|
"top_p": 0.8,
|
117
145
|
"response_format": {
|
118
146
|
"type": "json_schema",
|
@@ -2,11 +2,16 @@ from ray.serve import Application
|
|
2
2
|
|
3
3
|
from folder_classifier.app import FolderClassifierAPI
|
4
4
|
from folder_classifier.dto import AppConfig
|
5
|
+
from folder_classifier.util import get_openapi_key
|
5
6
|
|
6
7
|
|
7
8
|
def build_app(args: AppConfig) -> Application:
|
8
9
|
assert args and args.model, "AppConfig model is required"
|
9
10
|
assert args.model.app_name and args.model.deployment, "Model's app_name and deployment are required"
|
10
11
|
|
12
|
+
if args.model.fallback and args.model.fallback.openai_base_url and args.model.fallback.model:
|
13
|
+
if not args.model.fallback.openai_api_key:
|
14
|
+
args.model.fallback.openai_api_key = get_openapi_key()
|
15
|
+
|
11
16
|
app = FolderClassifierAPI.bind(args.model)
|
12
17
|
return app
|
@@ -2,10 +2,17 @@ from typing import List, Union, Literal, Optional
|
|
2
2
|
from pydantic import BaseModel, Field, ConfigDict
|
3
3
|
|
4
4
|
|
5
|
+
class FallbackConfig(BaseModel):
|
6
|
+
openai_base_url: str
|
7
|
+
openai_api_key: Optional[str] = None
|
8
|
+
model: str
|
9
|
+
|
10
|
+
|
5
11
|
class ModelConfig(BaseModel):
|
6
12
|
app_name: str
|
7
13
|
deployment: str
|
8
14
|
model: str
|
15
|
+
fallback: Optional[FallbackConfig] = None
|
9
16
|
|
10
17
|
|
11
18
|
class AppConfig(BaseModel):
|
@@ -19,7 +26,7 @@ class File(BaseModel):
|
|
19
26
|
|
20
27
|
class Folder(BaseModel):
|
21
28
|
name: str
|
22
|
-
type: Literal["
|
29
|
+
type: Literal["root_folder", "sub_folder"]
|
23
30
|
items: List[Union[File, 'Folder']] = Field(default_factory=list)
|
24
31
|
|
25
32
|
|
@@ -0,0 +1,103 @@
|
|
1
|
+
import os
|
2
|
+
from pathlib import PurePosixPath
|
3
|
+
from typing import List, Union
|
4
|
+
|
5
|
+
from folder_classifier.dto import Folder, File
|
6
|
+
import boto3
|
7
|
+
|
8
|
+
|
9
|
+
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gemini-2.0-flash")
|
10
|
+
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://openaiproxy.dev.cortoaws.com/v1")
|
11
|
+
|
12
|
+
PARAM_STORE_OPENAI_API_KEY = os.getenv("PARAM_STORE_OPENAI_API_KEY", "/AiService/OpenAiSettings/ApiKey")
|
13
|
+
AWS_REGION_NAME = os.getenv("AWS_REGION_NAME", "us-west-2")
|
14
|
+
|
15
|
+
|
16
|
+
def get_openapi_key() -> str:
|
17
|
+
boto_session = boto3.Session(region_name=AWS_REGION_NAME)
|
18
|
+
ssm_client = boto_session.client("ssm")
|
19
|
+
response = ssm_client.get_parameter(Name=PARAM_STORE_OPENAI_API_KEY, WithDecryption=True)
|
20
|
+
return response["Parameter"]["Value"]
|
21
|
+
|
22
|
+
|
23
|
+
def build_folder(paths: List[str]) -> Folder:
|
24
|
+
"""
|
25
|
+
Create a Folder tree from a list of file paths.
|
26
|
+
Assumptions:
|
27
|
+
- The file paths are delimited by "/"
|
28
|
+
- There are no '.' and '..' entries in the paths
|
29
|
+
- The paths are case-insensitive (Windows paths) -> 'ABC' and 'abc' resolve to the same item
|
30
|
+
"""
|
31
|
+
if not paths:
|
32
|
+
raise ValueError("No paths provided")
|
33
|
+
|
34
|
+
# Build a LOWER-CASED directory-prefix set so folder/file disambiguation is case-insensitive.
|
35
|
+
prefix_set_lower = set()
|
36
|
+
for p in paths:
|
37
|
+
parts = p.split('/')
|
38
|
+
for i in range(1, len(parts)):
|
39
|
+
prefix_set_lower.add('/'.join(parts[:i]).lower())
|
40
|
+
|
41
|
+
# Sort by depth so parents are created before children
|
42
|
+
sorted_paths = sorted(paths, key=lambda x: x.count('/'))
|
43
|
+
|
44
|
+
# Create the root folder (preserve first-seen casing)
|
45
|
+
root_name = sorted_paths[0].split('/')[0]
|
46
|
+
root = Folder(name=root_name, type="root_folder", items=[])
|
47
|
+
|
48
|
+
# Build the tree
|
49
|
+
for p in sorted_paths:
|
50
|
+
parts = p.split('/')
|
51
|
+
current = root
|
52
|
+
|
53
|
+
for idx, part in enumerate(parts[1:], start=1):
|
54
|
+
part_lower = part.lower()
|
55
|
+
full_path_lower = '/'.join(parts[:idx + 1]).lower()
|
56
|
+
is_last = idx == len(parts) - 1
|
57
|
+
|
58
|
+
# Case-insensitive lookup of existing child
|
59
|
+
existing = next((item for item in current.items if item.name.lower() == part_lower), None)
|
60
|
+
if existing:
|
61
|
+
if isinstance(existing, Folder):
|
62
|
+
current = existing
|
63
|
+
continue
|
64
|
+
|
65
|
+
# Determine type for new item
|
66
|
+
if is_last and full_path_lower not in prefix_set_lower:
|
67
|
+
if part.strip() in (".", ".."):
|
68
|
+
# These won't appear in the paths, ignore if they do.
|
69
|
+
continue
|
70
|
+
has_ext = bool(PurePosixPath(part).suffix)
|
71
|
+
is_dotfile = part.startswith('.') and len(part) > 1
|
72
|
+
is_file = has_ext or is_dotfile
|
73
|
+
new_item = File(name=part, type="file") if is_file else Folder(name=part, type="sub_folder", items=[])
|
74
|
+
else:
|
75
|
+
new_item = Folder(name=part, type="sub_folder", items=[])
|
76
|
+
|
77
|
+
current.items.append(new_item)
|
78
|
+
if isinstance(new_item, Folder):
|
79
|
+
current = new_item
|
80
|
+
|
81
|
+
return root
|
82
|
+
|
83
|
+
|
84
|
+
def render_tree(folder: Folder) -> str:
|
85
|
+
"""
|
86
|
+
Render Folder tree using ASCII tree characters (├──, └──, │).
|
87
|
+
"""
|
88
|
+
lines: List[str] = []
|
89
|
+
|
90
|
+
def recurse(node: Union[Folder, File], prefix: str, is_last: bool):
|
91
|
+
connector = "└── " if is_last else "├── "
|
92
|
+
lines.append(f"{prefix}{connector}{node.name}")
|
93
|
+
if isinstance(node, Folder):
|
94
|
+
child_prefix = prefix + (" " if is_last else "│ ")
|
95
|
+
for idx, child in enumerate(node.items):
|
96
|
+
recurse(child, child_prefix, idx == len(node.items) - 1)
|
97
|
+
|
98
|
+
# root
|
99
|
+
lines.append(folder.name)
|
100
|
+
for idx, child in enumerate(folder.items):
|
101
|
+
recurse(child, "", idx == len(folder.items) - 1)
|
102
|
+
|
103
|
+
return "\n".join(lines)
|
@@ -1,95 +0,0 @@
|
|
1
|
-
from typing import List, Union
|
2
|
-
|
3
|
-
from folder_classifier.dto import Folder, File
|
4
|
-
|
5
|
-
|
6
|
-
def build_folder(paths: List[str]) -> Folder:
|
7
|
-
"""
|
8
|
-
Create a Folder tree from a list of file paths;
|
9
|
-
The file paths are delimited by "/" - leaf segments are assumed to be files
|
10
|
-
"""
|
11
|
-
if not paths:
|
12
|
-
raise ValueError("No paths provided")
|
13
|
-
|
14
|
-
# Get all directory prefixes
|
15
|
-
prefix_set = set()
|
16
|
-
for p in paths:
|
17
|
-
parts = p.split('/')
|
18
|
-
for i in range(1, len(parts)):
|
19
|
-
prefix_set.add('/'.join(parts[:i]))
|
20
|
-
|
21
|
-
# Sort by depth so parents are created before children
|
22
|
-
sorted_paths = sorted(paths, key=lambda x: x.count('/'))
|
23
|
-
|
24
|
-
# Create root folder
|
25
|
-
root_name = sorted_paths[0].split('/')[0]
|
26
|
-
root = Folder(name=root_name, type="folder", items=[])
|
27
|
-
|
28
|
-
# Build the tree
|
29
|
-
for p in sorted_paths:
|
30
|
-
parts = p.split('/')
|
31
|
-
current = root
|
32
|
-
for idx, part in enumerate(parts[1:], start=1):
|
33
|
-
full_path = '/'.join(parts[:idx+1])
|
34
|
-
is_last = idx == len(parts) - 1
|
35
|
-
|
36
|
-
# existing item
|
37
|
-
existing = next((item for item in current.items if item.name == part), None)
|
38
|
-
if existing:
|
39
|
-
if isinstance(existing, Folder):
|
40
|
-
current = existing
|
41
|
-
continue
|
42
|
-
|
43
|
-
# Determine type for new item
|
44
|
-
if is_last and full_path not in prefix_set:
|
45
|
-
new_item = File(name=part, type="file")
|
46
|
-
else:
|
47
|
-
new_item = Folder(name=part, type="folder", items=[])
|
48
|
-
|
49
|
-
current.items.append(new_item)
|
50
|
-
if isinstance(new_item, Folder):
|
51
|
-
current = new_item
|
52
|
-
|
53
|
-
return root
|
54
|
-
|
55
|
-
|
56
|
-
def render_tree(folder: Folder) -> str:
|
57
|
-
"""
|
58
|
-
Render Folder tree using ASCII tree characters (├──, └──, │).
|
59
|
-
"""
|
60
|
-
lines: List[str] = []
|
61
|
-
|
62
|
-
def recurse(node: Union[Folder, File], prefix: str, is_last: bool):
|
63
|
-
connector = "└── " if is_last else "├── "
|
64
|
-
lines.append(f"{prefix}{connector}{node.name}")
|
65
|
-
if isinstance(node, Folder):
|
66
|
-
child_prefix = prefix + (" " if is_last else "│ ")
|
67
|
-
for idx, child in enumerate(node.items):
|
68
|
-
recurse(child, child_prefix, idx == len(node.items) - 1)
|
69
|
-
|
70
|
-
# root
|
71
|
-
lines.append(folder.name)
|
72
|
-
for idx, child in enumerate(folder.items):
|
73
|
-
recurse(child, "", idx == len(folder.items) - 1)
|
74
|
-
|
75
|
-
return "\n".join(lines)
|
76
|
-
|
77
|
-
|
78
|
-
def flatten_folder(folder: Folder, parent_path: str = "") -> List[str]:
|
79
|
-
"""
|
80
|
-
Traverses a Folder and returns a list of file paths.
|
81
|
-
Each path is constructed by joining folder and file names with '/'.
|
82
|
-
"""
|
83
|
-
paths: List[str] = []
|
84
|
-
# Build the path for the current folder
|
85
|
-
current_path = f"{parent_path}/{folder.name}" if parent_path else folder.name
|
86
|
-
|
87
|
-
for item in folder.items:
|
88
|
-
if item.type == "file":
|
89
|
-
paths.append(f"{current_path}/{item.name}")
|
90
|
-
else:
|
91
|
-
# Recursively flatten subfolders
|
92
|
-
paths.extend(flatten_folder(item, current_path))
|
93
|
-
return paths
|
94
|
-
|
95
|
-
|
File without changes
|
File without changes
|
File without changes
|
{folder_classifier-0.3.7 → folder_classifier-0.4.0}/folder_classifier.egg-info/dependency_links.txt
RENAMED
File without changes
|
{folder_classifier-0.3.7 → folder_classifier-0.4.0}/folder_classifier.egg-info/top_level.txt
RENAMED
File without changes
|
File without changes
|