projectdavid 1.31.0__py3-none-any.whl → 1.31.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of projectdavid might be problematic. Click here for more details.
- projectdavid/clients/file_processor.py +30 -80
- {projectdavid-1.31.0.dist-info → projectdavid-1.31.1.dist-info}/METADATA +1 -2
- {projectdavid-1.31.0.dist-info → projectdavid-1.31.1.dist-info}/RECORD +6 -6
- {projectdavid-1.31.0.dist-info → projectdavid-1.31.1.dist-info}/WHEEL +0 -0
- {projectdavid-1.31.0.dist-info → projectdavid-1.31.1.dist-info}/licenses/LICENSE +0 -0
- {projectdavid-1.31.0.dist-info → projectdavid-1.31.1.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import csv
|
|
3
3
|
import json
|
|
4
|
-
import mimetypes
|
|
5
4
|
import re
|
|
6
5
|
import textwrap
|
|
7
6
|
from concurrent.futures import ThreadPoolExecutor
|
|
@@ -10,10 +9,9 @@ from typing import Any, Dict, List, Tuple, Union
|
|
|
10
9
|
|
|
11
10
|
try: # Python 3.11+
|
|
12
11
|
from typing import LiteralString
|
|
13
|
-
except ImportError: # 3.9
|
|
12
|
+
except ImportError: # 3.9–3.10
|
|
14
13
|
from typing_extensions import LiteralString
|
|
15
14
|
|
|
16
|
-
import magic
|
|
17
15
|
import numpy as np
|
|
18
16
|
import pdfplumber
|
|
19
17
|
from docx import Document
|
|
@@ -54,54 +52,30 @@ class FileProcessor:
|
|
|
54
52
|
raise ValueError(f"{file_path.name} > {mb} MB limit")
|
|
55
53
|
|
|
56
54
|
# ------------------------------------------------------------------ #
|
|
57
|
-
# File-type detection
|
|
55
|
+
# File-type detection (simple extension map – NO libmagic)
|
|
58
56
|
# ------------------------------------------------------------------ #
|
|
59
57
|
def _detect_file_type(self, file_path: Path) -> str:
|
|
60
58
|
"""
|
|
61
|
-
Return
|
|
59
|
+
Return one of:
|
|
62
60
|
|
|
63
|
-
• 'pdf'
|
|
64
|
-
• '
|
|
65
|
-
• 'text'
|
|
61
|
+
• 'pdf' • 'csv' • 'json'
|
|
62
|
+
• 'office' (.doc/.docx/.pptx)
|
|
63
|
+
• 'text' (code / markup / plain text)
|
|
66
64
|
|
|
67
|
-
Raises *ValueError*
|
|
65
|
+
Raises *ValueError* if the extension is not recognised.
|
|
68
66
|
"""
|
|
69
|
-
# 1️⃣ Best-effort MIME sniff
|
|
70
|
-
mime_type: str | None = None
|
|
71
|
-
if magic is not None:
|
|
72
|
-
try:
|
|
73
|
-
mime_type = magic.from_file(str(file_path), mime=True)
|
|
74
|
-
except Exception:
|
|
75
|
-
mime_type = None
|
|
76
|
-
|
|
77
|
-
# 2️⃣ Fallback → mimetypes
|
|
78
|
-
if not mime_type:
|
|
79
|
-
mime_type, _ = mimetypes.guess_type(file_path.name)
|
|
80
|
-
|
|
81
67
|
suffix = file_path.suffix.lower()
|
|
82
68
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
"
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
"text/markdown",
|
|
94
|
-
"text/x-python",
|
|
95
|
-
"text/x-c",
|
|
96
|
-
"text/x-c++",
|
|
97
|
-
"text/x-java-source",
|
|
98
|
-
"text/x-script.python",
|
|
99
|
-
"text/html",
|
|
100
|
-
"text/css",
|
|
101
|
-
"application/typescript",
|
|
102
|
-
"text/javascript",
|
|
103
|
-
}
|
|
104
|
-
TEXT_EXTS = {
|
|
69
|
+
if suffix == ".pdf":
|
|
70
|
+
return "pdf"
|
|
71
|
+
if suffix == ".csv":
|
|
72
|
+
return "csv"
|
|
73
|
+
if suffix == ".json":
|
|
74
|
+
return "json"
|
|
75
|
+
if suffix in {".doc", ".docx", ".pptx"}:
|
|
76
|
+
return "office"
|
|
77
|
+
|
|
78
|
+
text_exts = {
|
|
105
79
|
".txt",
|
|
106
80
|
".md",
|
|
107
81
|
".rst",
|
|
@@ -120,32 +94,10 @@ class FileProcessor:
|
|
|
120
94
|
".html",
|
|
121
95
|
".css",
|
|
122
96
|
}
|
|
123
|
-
|
|
124
|
-
# --- PDF ---
|
|
125
|
-
if mime_type in PDF_MIMES or suffix == ".pdf":
|
|
126
|
-
return "pdf"
|
|
127
|
-
|
|
128
|
-
# --- CSV ---
|
|
129
|
-
if mime_type in CSV_MIMES or suffix == ".csv":
|
|
130
|
-
return "csv"
|
|
131
|
-
|
|
132
|
-
# --- JSON ---
|
|
133
|
-
if mime_type in JSON_MIMES or suffix == ".json":
|
|
134
|
-
return "json"
|
|
135
|
-
|
|
136
|
-
# --- Office documents ---
|
|
137
|
-
if mime_type in OFFICE_MIMES or suffix in {".doc", ".docx", ".pptx"}:
|
|
138
|
-
return "office"
|
|
139
|
-
|
|
140
|
-
# --- Generic text / code / markup ---
|
|
141
|
-
if mime_type in TEXT_MIMES or suffix in TEXT_EXTS:
|
|
97
|
+
if suffix in text_exts:
|
|
142
98
|
return "text"
|
|
143
99
|
|
|
144
|
-
|
|
145
|
-
raise ValueError(
|
|
146
|
-
f"Unsupported file type for '{file_path.name}': "
|
|
147
|
-
f"MIME={mime_type or 'unknown'} extension={suffix}"
|
|
148
|
-
)
|
|
100
|
+
raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")
|
|
149
101
|
|
|
150
102
|
# ------------------------------------------------------------------ #
|
|
151
103
|
# Public entry-point
|
|
@@ -156,19 +108,17 @@ class FileProcessor:
|
|
|
156
108
|
self.validate_file(file_path)
|
|
157
109
|
ftype = self._detect_file_type(file_path)
|
|
158
110
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
# Safety net (should never hit)
|
|
171
|
-
raise ValueError(f"Unsupported file type: {file_path.suffix}")
|
|
111
|
+
dispatch_map = {
|
|
112
|
+
"pdf": self._process_pdf,
|
|
113
|
+
"text": self._process_text,
|
|
114
|
+
"csv": self._process_csv,
|
|
115
|
+
"office": self._process_office,
|
|
116
|
+
"json": self._process_json,
|
|
117
|
+
}
|
|
118
|
+
if ftype not in dispatch_map:
|
|
119
|
+
raise ValueError(f"Unsupported file type: {file_path.suffix}")
|
|
120
|
+
|
|
121
|
+
return await dispatch_map[ftype](file_path)
|
|
172
122
|
|
|
173
123
|
# ------------------------------------------------------------------ #
|
|
174
124
|
# PDF
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: projectdavid
|
|
3
|
-
Version: 1.31.0
|
|
3
|
+
Version: 1.31.1
|
|
4
4
|
Summary: Python SDK for interacting with the Entities Assistant API.
|
|
5
5
|
Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
|
|
6
6
|
License: PolyForm Noncommercial License 1.0.0
|
|
@@ -27,7 +27,6 @@ Requires-Dist: validators<0.35.0,>=0.29.0
|
|
|
27
27
|
Requires-Dist: sentence-transformers<5.0,>=3.4.0
|
|
28
28
|
Requires-Dist: sseclient-py
|
|
29
29
|
Requires-Dist: requests
|
|
30
|
-
Requires-Dist: python-magic
|
|
31
30
|
Requires-Dist: python-docx
|
|
32
31
|
Requires-Dist: python-pptx
|
|
33
32
|
Provides-Extra: dev
|
|
@@ -9,7 +9,7 @@ projectdavid/clients/assistants_client.py,sha256=SsIGa5wPr7ga9WX0ywam3djUF-uWFdk
|
|
|
9
9
|
projectdavid/clients/base_client.py,sha256=UWl6nr6sxD1_xC6iyptQDR1tnNdFCOrEx5cEUPCRqJE,3417
|
|
10
10
|
projectdavid/clients/base_vector_store.py,sha256=jXivmqAW1bgYcLgIeW-hPxOiWZbs2hCsLy4oWzSvpNI,2061
|
|
11
11
|
projectdavid/clients/event_handler.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
-
projectdavid/clients/file_processor.py,sha256
|
|
12
|
+
projectdavid/clients/file_processor.py,sha256=t-Uw-kBP_VmlguMxO9PWY6ANuMAY0PstQDW37wLPF0Q,13980
|
|
13
13
|
projectdavid/clients/file_search.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
14
|
projectdavid/clients/files_client.py,sha256=XkIDzbQFGDrd88taf0Kouc_4YJOPIYEHiIyWYLKDofI,15581
|
|
15
15
|
projectdavid/clients/inference_client.py,sha256=xz4ACPv5Tkis604QxO5mJX1inH_TGDfQP-31geETYpE,6609
|
|
@@ -32,8 +32,8 @@ projectdavid/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
|
|
|
32
32
|
projectdavid/utils/monitor_launcher.py,sha256=3YAgJdeuaUvq3JGvpA4ymqFsAnk29nH5q93cwStP4hc,2836
|
|
33
33
|
projectdavid/utils/run_monitor.py,sha256=F_WkqIP-qnWH-4llIbileWWLfRj2Q1Cg-ni23SR1rec,3786
|
|
34
34
|
projectdavid/utils/vector_search_formatter.py,sha256=YTe3HPGec26qGY7uxY8_GS8lc4QaN6aNXMzkl29nZpI,1735
|
|
35
|
-
projectdavid-1.31.0.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
|
|
36
|
-
projectdavid-1.31.
|
|
37
|
-
projectdavid-1.31.0.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
|
|
38
|
-
projectdavid-1.31.0.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
|
|
39
|
-
projectdavid-1.31.0.dist-info/RECORD,,
|
|
35
|
+
projectdavid-1.31.1.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
|
|
36
|
+
projectdavid-1.31.1.dist-info/METADATA,sha256=f-SkJ06HipWaVJZ0W-bECBP7-2OjCNqTNc58kN7A0qw,10781
|
|
37
|
+
projectdavid-1.31.1.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
|
|
38
|
+
projectdavid-1.31.1.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
|
|
39
|
+
projectdavid-1.31.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|