chunkr-ai 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chunkr_ai/__init__.py CHANGED
@@ -1,3 +1,14 @@
1
1
  from .api.chunkr import Chunkr
2
+ import tomllib
3
+ from pathlib import Path
2
4
 
3
- __all__ = ["Chunkr"]
5
+ # Read version from pyproject.toml
6
+ try:
7
+ pyproject_path = Path(__file__).parent.parent.parent / "pyproject.toml"
8
+ with open(pyproject_path, "rb") as f:
9
+ pyproject_data = tomllib.load(f)
10
+ __version__ = pyproject_data["project"]["version"]
11
+ except Exception:
12
+ __version__ = "unknown"
13
+
14
+ __all__ = ["Chunkr", "__version__"]
chunkr_ai/api/auth.py CHANGED
@@ -1,3 +1,28 @@
1
+ import platform
2
+ import sys
3
+ import tomllib
4
+ from pathlib import Path
5
+
6
+ def _find_pyproject_toml(start_path: Path) -> Path | None:
7
+ """Search for pyproject.toml in current and parent directories."""
8
+ for parent in [start_path, *start_path.parents]:
9
+ candidate = parent / "pyproject.toml"
10
+ if candidate.is_file():
11
+ return candidate
12
+ return None
13
+
14
+ # Read version from pyproject.toml
15
+ try:
16
+ pyproject_path = _find_pyproject_toml(Path(__file__).resolve().parent)
17
+ if pyproject_path is not None:
18
+ with open(pyproject_path, "rb") as f:
19
+ pyproject_data = tomllib.load(f)
20
+ __version__ = pyproject_data["project"]["version"]
21
+ else:
22
+ __version__ = "unknown"
23
+ except Exception:
24
+ __version__ = "unknown"
25
+
1
26
  class HeadersMixin:
2
27
  """Mixin class for handling authorization headers"""
3
28
  _api_key: str = ""
@@ -9,5 +34,9 @@ class HeadersMixin:
9
34
  return self._api_key
10
35
 
11
36
  def _headers(self) -> dict:
12
- """Generate authorization headers"""
13
- return {"Authorization": self.get_api_key()}
37
+ """Generate authorization headers and version information"""
38
+ user_agent = f"chunkr-ai/{__version__} (Python/{sys.version.split()[0]}; {platform.system()}/{platform.release()})"
39
+ return {
40
+ "Authorization": self.get_api_key(),
41
+ "User-Agent": user_agent
42
+ }
chunkr_ai/api/chunkr.py CHANGED
@@ -42,7 +42,7 @@ class Chunkr(ChunkrBase):
42
42
  data = await prepare_upload_data(file, filename, config)
43
43
  assert self._client is not None
44
44
  r = await self._client.post(
45
- f"{self.url}/api/v1/task/parse", json=data, headers=self._headers()
45
+ f"{self.url}/task/parse", json=data, headers=self._headers()
46
46
  )
47
47
  r.raise_for_status()
48
48
  return TaskResponse(**r.json()).with_client(cast(ChunkrClientProtocol, self), True, False)
@@ -55,7 +55,7 @@ class Chunkr(ChunkrBase):
55
55
  data = await prepare_upload_data(None, None, config)
56
56
  assert self._client is not None
57
57
  r = await self._client.patch(
58
- f"{self.url}/api/v1/task/{task_id}/parse",
58
+ f"{self.url}/task/{task_id}/parse",
59
59
  json=data,
60
60
  headers=self._headers(),
61
61
  )
@@ -71,7 +71,7 @@ class Chunkr(ChunkrBase):
71
71
  }
72
72
  assert self._client is not None
73
73
  r = await self._client.get(
74
- f"{self.url}/api/v1/task/{task_id}",
74
+ f"{self.url}/task/{task_id}",
75
75
  params=params,
76
76
  headers=self._headers()
77
77
  )
@@ -83,7 +83,7 @@ class Chunkr(ChunkrBase):
83
83
  async def delete_task(self, task_id: str) -> None:
84
84
  assert self._client is not None
85
85
  r = await self._client.delete(
86
- f"{self.url}/api/v1/task/{task_id}", headers=self._headers()
86
+ f"{self.url}/task/{task_id}", headers=self._headers()
87
87
  )
88
88
  r.raise_for_status()
89
89
 
@@ -92,7 +92,7 @@ class Chunkr(ChunkrBase):
92
92
  async def cancel_task(self, task_id: str) -> None:
93
93
  assert self._client is not None
94
94
  r = await self._client.get(
95
- f"{self.url}/api/v1/task/{task_id}/cancel", headers=self._headers()
95
+ f"{self.url}/task/{task_id}/cancel", headers=self._headers()
96
96
  )
97
97
  r.raise_for_status()
98
98
 
@@ -1,6 +1,6 @@
1
1
  from pydantic import BaseModel, Field, ConfigDict
2
2
  from enum import Enum
3
- from typing import Any, List, Optional, Union
3
+ from typing import List, Optional, Union
4
4
  from pydantic import field_validator, field_serializer
5
5
 
6
6
  class CroppingStrategy(str, Enum):
@@ -20,15 +20,17 @@ class EmbedSource(str, Enum):
20
20
  class GenerationStrategy(str, Enum):
21
21
  LLM = "LLM"
22
22
  AUTO = "Auto"
23
+ IGNORE = "Ignore"
23
24
 
24
25
  class GenerationConfig(BaseModel):
25
26
  format: Optional[SegmentFormat] = None
26
27
  strategy: Optional[GenerationStrategy] = None
27
- llm: Optional[str] = None
28
28
  crop_image: Optional[CroppingStrategy] = None
29
- embed_sources: Optional[List[EmbedSource]] = None
30
29
  extended_context: Optional[bool] = None
30
+ description: Optional[bool] = None
31
31
  # Deprecated fields for backwards compatibility
32
+ llm: Optional[str] = None # Deprecated
33
+ embed_sources: Optional[List[EmbedSource]] = None # Deprecated
32
34
  html: Optional[GenerationStrategy] = None # Deprecated: Use format=SegmentFormat.HTML and strategy instead
33
35
  markdown: Optional[GenerationStrategy] = None # Deprecated: Use format=SegmentFormat.MARKDOWN and strategy instead
34
36
 
@@ -83,7 +85,7 @@ class TokenizerType(BaseModel):
83
85
  return {}
84
86
 
85
87
  class ChunkProcessing(BaseModel):
86
- ignore_headers_and_footers: Optional[bool] = True
88
+ ignore_headers_and_footers: Optional[bool] = None # Deprecated
87
89
  target_length: Optional[int] = None
88
90
  tokenizer: Optional[Union[TokenizerType, Tokenizer, str]] = None
89
91
 
@@ -286,6 +288,7 @@ class Page(BaseModel):
286
288
  page_height: float
287
289
  page_width: float
288
290
  ss_sheet_name: Optional[str] = None
291
+ dpi: Optional[float] = None
289
292
 
290
293
  class Segment(BaseModel):
291
294
  bbox: BoundingBox
@@ -303,6 +306,8 @@ class Segment(BaseModel):
303
306
  confidence: Optional[float]
304
307
  text: str = ""
305
308
  segment_length: Optional[int] = None
309
+ embed: Optional[str] = None
310
+ description: Optional[str] = None
306
311
  # Spreadsheet-specific fields
307
312
  ss_cells: Optional[List[Cell]] = None
308
313
  ss_header_bbox: Optional[BoundingBox] = None
@@ -317,6 +322,7 @@ class Chunk(BaseModel):
317
322
  chunk_length: int
318
323
  segments: List[Segment]
319
324
  embed: Optional[str] = None
325
+ content: Optional[str] = None
320
326
 
321
327
  class OutputResponse(BaseModel):
322
328
  chunks: List[Chunk]
@@ -347,10 +353,6 @@ class Configuration(BaseModel):
347
353
 
348
354
  class OutputConfiguration(Configuration):
349
355
  input_file_url: Optional[str] = None
350
- # Deprecated
351
- json_schema: Optional[Any] = None
352
- model: Optional[Model] = None
353
- target_chunk_length: Optional[int] = None
354
356
 
355
357
  class Status(str, Enum):
356
358
  STARTING = "Starting"
@@ -2,10 +2,12 @@ import asyncio
2
2
  import functools
3
3
  import httpx
4
4
  import nest_asyncio
5
- from typing import Callable, Any, TypeVar, Awaitable, Union, overload
6
- try:
5
+ from typing import Callable, Any, TypeVar, Awaitable, Union
6
+ import sys
7
+
8
+ if sys.version_info >= (3, 10):
7
9
  from typing import ParamSpec
8
- except ImportError:
10
+ else:
9
11
  from typing_extensions import ParamSpec
10
12
 
11
13
  T = TypeVar('T')
chunkr_ai/api/misc.py CHANGED
@@ -3,7 +3,7 @@ import base64
3
3
  import io
4
4
  from pathlib import Path
5
5
  from PIL import Image
6
- from typing import Union, Tuple, BinaryIO, Optional, Any
6
+ from typing import Union, Tuple, BinaryIO, Optional
7
7
 
8
8
  async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, bytearray, memoryview]) -> Tuple[Optional[str], str]:
9
9
  """Convert various file types into a tuple of (filename, file content).
@@ -39,7 +39,7 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, byte
39
39
  base64.b64decode(potential_base64)
40
40
  # If we get here, it was a valid base64 string in bytes form
41
41
  return None, potential_base64
42
- except:
42
+ except Exception:
43
43
  # Not a base64 string in bytes form, encode it as base64
44
44
  base64_str = base64.b64encode(file_bytes).decode()
45
45
  return None, base64_str
@@ -66,14 +66,14 @@ async def prepare_file(file: Union[str, Path, BinaryIO, Image.Image, bytes, byte
66
66
  # Just test if it's valid base64, don't store the result
67
67
  base64.b64decode(file)
68
68
  return None, file
69
- except:
69
+ except Exception:
70
70
  raise ValueError(f"File not found: {file} and it's not a valid base64 string")
71
71
  except Exception as e:
72
72
  # If string can't be converted to Path or decoded as base64, it might still be a base64 string
73
73
  try:
74
- base64.b64decode(file)
75
- return None, file
76
- except:
74
+ base64.b64decode(str(file))
75
+ return None, str(file)
76
+ except Exception:
77
77
  raise ValueError(f"Unable to process file: {e}")
78
78
 
79
79
  # Handle file paths - convert to base64
@@ -1,5 +1,5 @@
1
1
  from datetime import datetime
2
- from typing import Optional, cast, Awaitable, Union
2
+ from typing import Optional, cast, Union
3
3
  from pydantic import BaseModel, PrivateAttr
4
4
  import asyncio
5
5
  import json
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chunkr-ai
3
- Version: 0.3.1
3
+ Version: 0.3.3
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  License: MIT License
@@ -0,0 +1,16 @@
1
+ chunkr_ai/__init__.py,sha256=xkXAzwvm1cFfrdJOOoZ2w9yBoudz_H6OF2HJimOma5I,409
2
+ chunkr_ai/models.py,sha256=NvFJOpsgzEyYHhE-flp7Yr9tpTDvFmF4T87jttFRquU,1202
3
+ chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ chunkr_ai/api/auth.py,sha256=eb0XykbDvWgNWCkeI19XInDNnuU06s_Y-_KC89LhWH0,1408
5
+ chunkr_ai/api/chunkr.py,sha256=3QAlZeq8zbiHp1HxgWpBBUAmvjabD9iBZxIkuGVsKJk,3822
6
+ chunkr_ai/api/chunkr_base.py,sha256=8roSPoCADmaXM2r7zz2iHfZzIcY9NopOfa4j-dfk8RA,6310
7
+ chunkr_ai/api/configuration.py,sha256=YDSN-hv5VyffKxDJBUaVE3u27BsDafxGNGfOBVRmRUk,11682
8
+ chunkr_ai/api/decorators.py,sha256=B-neL5d4N-skq2rjnOfaCVSTz6HEye6udcykacbv7G4,4399
9
+ chunkr_ai/api/misc.py,sha256=pNjbiD5reMdDSkjNTWHn0VgTVsGYn0fl751WuRtSkL8,5389
10
+ chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
11
+ chunkr_ai/api/task_response.py,sha256=omnKkACjN3ijnbG6UncHLI_IDbsaJ1wHu1g4X9JbijU,8017
12
+ chunkr_ai-0.3.3.dist-info/licenses/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
13
+ chunkr_ai-0.3.3.dist-info/METADATA,sha256=eQJqOqhXUch-11cqAC4dkEUc2ZkQxAe91vlYzbCEFkE,7086
14
+ chunkr_ai-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ chunkr_ai-0.3.3.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
16
+ chunkr_ai-0.3.3.dist-info/RECORD,,
@@ -1,16 +0,0 @@
1
- chunkr_ai/__init__.py,sha256=6KpYv2lmD6S5z2kc9pqwuLP5VDHmOuu2qDZArUIhb1s,53
2
- chunkr_ai/models.py,sha256=NvFJOpsgzEyYHhE-flp7Yr9tpTDvFmF4T87jttFRquU,1202
3
- chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- chunkr_ai/api/auth.py,sha256=0RSNFPvHt4Nrg8qtP2xvA2KbR0J_KUe1B_tKynbq9Fc,436
5
- chunkr_ai/api/chunkr.py,sha256=uSNYtB_mcs4-QRKsX7wZb8yv6ayXgRrJSDNZ-EbAyvc,3857
6
- chunkr_ai/api/chunkr_base.py,sha256=8roSPoCADmaXM2r7zz2iHfZzIcY9NopOfa4j-dfk8RA,6310
7
- chunkr_ai/api/configuration.py,sha256=y_jd3K5GB-P8N3uym4wqHDVq-Rq-VT_bhqJqgKs0PVg,11586
8
- chunkr_ai/api/decorators.py,sha256=w1l_ZEkl99C-BO3qRTbi74sYwHDFspB1Bjt1Arv9lPc,4384
9
- chunkr_ai/api/misc.py,sha256=AaGLxZlMzNgVPwErskDRKc2UVGkC0JwxLXU-enPwzA0,5354
10
- chunkr_ai/api/protocol.py,sha256=LjPrYSq52m1afIlAo0yVGXlGZxPRh8J6g7S4PAit3Zo,388
11
- chunkr_ai/api/task_response.py,sha256=VYa62E08VlZUyjn2YslnY4cohdK9e53HbEzsaYIXKXM,8028
12
- chunkr_ai-0.3.1.dist-info/licenses/LICENSE,sha256=w3R12yNDyZpMiy2lxy_hvNbsldC75ww79sF0u11rkho,1069
13
- chunkr_ai-0.3.1.dist-info/METADATA,sha256=_Lg59OcvE1hpsbc3zg20yQFGQ2bpAqOXSx_o6_1UlzY,7086
14
- chunkr_ai-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- chunkr_ai-0.3.1.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
16
- chunkr_ai-0.3.1.dist-info/RECORD,,