projectdavid 1.31.0__py3-none-any.whl → 1.31.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of projectdavid might be problematic. Click here for more details.

@@ -1,7 +1,6 @@
1
1
  import asyncio
2
2
  import csv
3
3
  import json
4
- import mimetypes
5
4
  import re
6
5
  import textwrap
7
6
  from concurrent.futures import ThreadPoolExecutor
@@ -10,10 +9,9 @@ from typing import Any, Dict, List, Tuple, Union
10
9
 
11
10
  try: # Python 3.11+
12
11
  from typing import LiteralString
13
- except ImportError: # 3.9 - 3.10
12
+ except ImportError: # 3.9–3.10
14
13
  from typing_extensions import LiteralString
15
14
 
16
- import magic
17
15
  import numpy as np
18
16
  import pdfplumber
19
17
  from docx import Document
@@ -54,54 +52,30 @@ class FileProcessor:
54
52
  raise ValueError(f"{file_path.name} > {mb} MB limit")
55
53
 
56
54
  # ------------------------------------------------------------------ #
57
- # File-type detection (extension + MIME)
55
+ # File-type detection (simple extension map – NO libmagic)
58
56
  # ------------------------------------------------------------------ #
59
57
  def _detect_file_type(self, file_path: Path) -> str:
60
58
  """
61
- Return a handler tag:
59
+ Return one of:
62
60
 
63
- • 'pdf' • 'csv'
64
- • 'json' 'office'
65
- • 'text'
61
+ • 'pdf' • 'csv' • 'json'
62
+ • 'office' (.doc/.docx/.pptx)
63
+ • 'text' (code / markup / plain text)
66
64
 
67
- Raises *ValueError* on anything unknown.
65
+ Raises *ValueError* if the extension is not recognised.
68
66
  """
69
- # 1️⃣ Best-effort MIME sniff
70
- mime_type: str | None = None
71
- if magic is not None:
72
- try:
73
- mime_type = magic.from_file(str(file_path), mime=True)
74
- except Exception:
75
- mime_type = None
76
-
77
- # 2️⃣ Fallback → mimetypes
78
- if not mime_type:
79
- mime_type, _ = mimetypes.guess_type(file_path.name)
80
-
81
67
  suffix = file_path.suffix.lower()
82
68
 
83
- PDF_MIMES = {"application/pdf"}
84
- CSV_MIMES = {"text/csv", "application/csv"}
85
- JSON_MIMES = {"application/json"}
86
- OFFICE_MIMES = {
87
- "application/msword",
88
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
89
- "application/vnd.openxmlformats-officedocument.presentationml.presentation",
90
- }
91
- TEXT_MIMES = {
92
- "text/plain",
93
- "text/markdown",
94
- "text/x-python",
95
- "text/x-c",
96
- "text/x-c++",
97
- "text/x-java-source",
98
- "text/x-script.python",
99
- "text/html",
100
- "text/css",
101
- "application/typescript",
102
- "text/javascript",
103
- }
104
- TEXT_EXTS = {
69
+ if suffix == ".pdf":
70
+ return "pdf"
71
+ if suffix == ".csv":
72
+ return "csv"
73
+ if suffix == ".json":
74
+ return "json"
75
+ if suffix in {".doc", ".docx", ".pptx"}:
76
+ return "office"
77
+
78
+ text_exts = {
105
79
  ".txt",
106
80
  ".md",
107
81
  ".rst",
@@ -120,32 +94,10 @@ class FileProcessor:
120
94
  ".html",
121
95
  ".css",
122
96
  }
123
-
124
- # --- PDF ---
125
- if mime_type in PDF_MIMES or suffix == ".pdf":
126
- return "pdf"
127
-
128
- # --- CSV ---
129
- if mime_type in CSV_MIMES or suffix == ".csv":
130
- return "csv"
131
-
132
- # --- JSON ---
133
- if mime_type in JSON_MIMES or suffix == ".json":
134
- return "json"
135
-
136
- # --- Office documents ---
137
- if mime_type in OFFICE_MIMES or suffix in {".doc", ".docx", ".pptx"}:
138
- return "office"
139
-
140
- # --- Generic text / code / markup ---
141
- if mime_type in TEXT_MIMES or suffix in TEXT_EXTS:
97
+ if suffix in text_exts:
142
98
  return "text"
143
99
 
144
- # --- Unsupported ---
145
- raise ValueError(
146
- f"Unsupported file type for '{file_path.name}': "
147
- f"MIME={mime_type or 'unknown'} extension={suffix}"
148
- )
100
+ raise ValueError(f"Unsupported file type: {file_path.name} (ext={suffix})")
149
101
 
150
102
  # ------------------------------------------------------------------ #
151
103
  # Public entry-point
@@ -156,19 +108,17 @@ class FileProcessor:
156
108
  self.validate_file(file_path)
157
109
  ftype = self._detect_file_type(file_path)
158
110
 
159
- if ftype == "pdf":
160
- return await self._process_pdf(file_path)
161
- if ftype == "text":
162
- return await self._process_text(file_path)
163
- if ftype == "csv":
164
- return await self._process_csv(file_path)
165
- if ftype == "office":
166
- return await self._process_office(file_path)
167
- if ftype == "json":
168
- return await self._process_json(file_path)
169
-
170
- # Safety net (should never hit)
171
- raise ValueError(f"Unsupported file type: {file_path.suffix}")
111
+ dispatch_map = {
112
+ "pdf": self._process_pdf,
113
+ "text": self._process_text,
114
+ "csv": self._process_csv,
115
+ "office": self._process_office,
116
+ "json": self._process_json,
117
+ }
118
+ if ftype not in dispatch_map:
119
+ raise ValueError(f"Unsupported file type: {file_path.suffix}")
120
+
121
+ return await dispatch_map[ftype](file_path)
172
122
 
173
123
  # ------------------------------------------------------------------ #
174
124
  # PDF
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: projectdavid
3
- Version: 1.31.0
3
+ Version: 1.31.1
4
4
  Summary: Python SDK for interacting with the Entities Assistant API.
5
5
  Author-email: Francis Neequaye Armah <francis.neequaye@projectdavid.co.uk>
6
6
  License: PolyForm Noncommercial License 1.0.0
@@ -27,7 +27,6 @@ Requires-Dist: validators<0.35.0,>=0.29.0
27
27
  Requires-Dist: sentence-transformers<5.0,>=3.4.0
28
28
  Requires-Dist: sseclient-py
29
29
  Requires-Dist: requests
30
- Requires-Dist: python-magic
31
30
  Requires-Dist: python-docx
32
31
  Requires-Dist: python-pptx
33
32
  Provides-Extra: dev
@@ -9,7 +9,7 @@ projectdavid/clients/assistants_client.py,sha256=SsIGa5wPr7ga9WX0ywam3djUF-uWFdk
9
9
  projectdavid/clients/base_client.py,sha256=UWl6nr6sxD1_xC6iyptQDR1tnNdFCOrEx5cEUPCRqJE,3417
10
10
  projectdavid/clients/base_vector_store.py,sha256=jXivmqAW1bgYcLgIeW-hPxOiWZbs2hCsLy4oWzSvpNI,2061
11
11
  projectdavid/clients/event_handler.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
- projectdavid/clients/file_processor.py,sha256=-dRibUVVfGXjPRP3P2kzJaRZYvagIUKgDmfmi96V45w,15586
12
+ projectdavid/clients/file_processor.py,sha256=t-Uw-kBP_VmlguMxO9PWY6ANuMAY0PstQDW37wLPF0Q,13980
13
13
  projectdavid/clients/file_search.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
14
  projectdavid/clients/files_client.py,sha256=XkIDzbQFGDrd88taf0Kouc_4YJOPIYEHiIyWYLKDofI,15581
15
15
  projectdavid/clients/inference_client.py,sha256=xz4ACPv5Tkis604QxO5mJX1inH_TGDfQP-31geETYpE,6609
@@ -32,8 +32,8 @@ projectdavid/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
32
32
  projectdavid/utils/monitor_launcher.py,sha256=3YAgJdeuaUvq3JGvpA4ymqFsAnk29nH5q93cwStP4hc,2836
33
33
  projectdavid/utils/run_monitor.py,sha256=F_WkqIP-qnWH-4llIbileWWLfRj2Q1Cg-ni23SR1rec,3786
34
34
  projectdavid/utils/vector_search_formatter.py,sha256=YTe3HPGec26qGY7uxY8_GS8lc4QaN6aNXMzkl29nZpI,1735
35
- projectdavid-1.31.0.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
36
- projectdavid-1.31.0.dist-info/METADATA,sha256=we616BQkChiuQYO_-UfQ1VL-7j-IfDfPI7OtheEMsUM,10809
37
- projectdavid-1.31.0.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
38
- projectdavid-1.31.0.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
39
- projectdavid-1.31.0.dist-info/RECORD,,
35
+ projectdavid-1.31.1.dist-info/licenses/LICENSE,sha256=_8yjiEGttpS284BkfhXxfERqTRZW_tUaHiBB0GTJTMg,4563
36
+ projectdavid-1.31.1.dist-info/METADATA,sha256=f-SkJ06HipWaVJZ0W-bECBP7-2OjCNqTNc58kN7A0qw,10781
37
+ projectdavid-1.31.1.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
38
+ projectdavid-1.31.1.dist-info/top_level.txt,sha256=kil8GU4s7qYRfNnzGnFHhZnSNRSxgNG-J4HLgQMmMtw,13
39
+ projectdavid-1.31.1.dist-info/RECORD,,