chunkr-ai 0.0.6__tar.gz → 0.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. {chunkr_ai-0.0.6/src/chunkr_ai.egg-info → chunkr_ai-0.0.8}/PKG-INFO +1 -1
  2. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/pyproject.toml +1 -1
  3. chunkr_ai-0.0.8/src/chunkr_ai/api/base.py +85 -0
  4. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai/api/chunkr.py +63 -4
  5. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai/api/chunkr_async.py +43 -4
  6. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai/api/config.py +24 -24
  7. chunkr_ai-0.0.8/src/chunkr_ai/api/misc.py +106 -0
  8. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai/api/task.py +65 -11
  9. chunkr_ai-0.0.8/src/chunkr_ai/main.py +12 -0
  10. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai/models.py +0 -1
  11. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
  12. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai.egg-info/SOURCES.txt +1 -0
  13. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/tests/test_chunkr.py +119 -2
  14. chunkr_ai-0.0.6/src/chunkr_ai/api/base.py +0 -173
  15. chunkr_ai-0.0.6/src/chunkr_ai/main.py +0 -0
  16. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/LICENSE +0 -0
  17. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/README.md +0 -0
  18. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/setup.cfg +0 -0
  19. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai/__init__.py +0 -0
  20. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai/api/__init__.py +0 -0
  21. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai/api/api.py +0 -0
  22. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai/api/auth.py +0 -0
  23. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai/api/protocol.py +0 -0
  24. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  25. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai.egg-info/requires.txt +0 -0
  26. {chunkr_ai-0.0.6 → chunkr_ai-0.0.8}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.6
3
+ Version: 0.0.8
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunkr-ai"
7
- version = "0.0.6"
7
+ version = "0.0.8"
8
8
  authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
9
  description = "Python client for Chunkr: open source document intelligence"
10
10
  readme = "README.md"
@@ -0,0 +1,85 @@
1
+ from .config import Configuration
2
+ from .task import TaskResponse
3
+ from .auth import HeadersMixin
4
+ from abc import abstractmethod
5
+ from dotenv import load_dotenv
6
+ import os
7
+ from pathlib import Path
8
+ from PIL import Image
9
+ from typing import BinaryIO, Union
10
+
11
+ class ChunkrBase(HeadersMixin):
12
+ """Base class with shared functionality for Chunkr API clients."""
13
+
14
+ def __init__(self, url: str = None, api_key: str = None):
15
+ load_dotenv()
16
+ self.url = (
17
+ url or
18
+ os.getenv('CHUNKR_URL') or
19
+ 'https://api.chunkr.ai'
20
+ )
21
+ self._api_key = (
22
+ api_key or
23
+ os.getenv('CHUNKR_API_KEY')
24
+ )
25
+ if not self._api_key:
26
+ raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
27
+
28
+ self.url = self.url.rstrip("/")
29
+
30
+ @abstractmethod
31
+ def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
32
+ """Upload a file and wait for processing to complete.
33
+
34
+ Must be implemented by subclasses.
35
+ """
36
+ pass
37
+
38
+ @abstractmethod
39
+ def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
40
+ """Update a task by its ID.
41
+
42
+ Must be implemented by subclasses.
43
+ """
44
+ pass
45
+
46
+ @abstractmethod
47
+ def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
48
+ """Upload a file for processing and immediately return the task response.
49
+
50
+ Must be implemented by subclasses.
51
+ """
52
+ pass
53
+
54
+ @abstractmethod
55
+ def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
56
+ """Update a task by its ID.
57
+
58
+ Must be implemented by subclasses.
59
+ """
60
+ pass
61
+
62
+ @abstractmethod
63
+ def get_task(self, task_id: str) -> TaskResponse:
64
+ """Get a task response by its ID.
65
+
66
+ Must be implemented by subclasses.
67
+ """
68
+ pass
69
+
70
+ @abstractmethod
71
+ def delete_task(self, task_id: str) -> None:
72
+ """Delete a task by its ID.
73
+
74
+ Must be implemented by subclasses.
75
+ """
76
+ pass
77
+
78
+ @abstractmethod
79
+ def cancel_task(self, task_id: str) -> None:
80
+ """Cancel a task by its ID.
81
+
82
+ Must be implemented by subclasses.
83
+ """
84
+ pass
85
+
@@ -5,6 +5,7 @@ from pathlib import Path
5
5
  from PIL import Image
6
6
  import requests
7
7
  from typing import Union, BinaryIO
8
+ from .misc import prepare_upload_data
8
9
 
9
10
  class Chunkr(ChunkrBase):
10
11
  """Chunkr API client"""
@@ -43,10 +44,23 @@ class Chunkr(ChunkrBase):
43
44
  Returns:
44
45
  TaskResponse: The completed task response
45
46
  """
46
- task = self.start_upload(file, config)
47
+ task = self.create_task(file, config)
47
48
  return task.poll()
49
+
50
+ def update(self, task_id: str, config: Configuration) -> TaskResponse:
51
+ """Update a task by its ID and wait for processing to complete.
52
+
53
+ Args:
54
+ task_id: The ID of the task to update
55
+ config: Configuration options for processing. Optional.
48
56
 
49
- def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
57
+ Returns:
58
+ TaskResponse: The updated task response
59
+ """
60
+ task = self.update_task(task_id, config)
61
+ return task.poll()
62
+
63
+ def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
50
64
  """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`
51
65
 
52
66
  Args:
@@ -80,16 +94,35 @@ class Chunkr(ChunkrBase):
80
94
  Returns:
81
95
  TaskResponse: The initial task response
82
96
  """
83
- files, data = self._prepare_upload_data(file, config)
97
+ files= prepare_upload_data(file, config)
84
98
  r = self._session.post(
85
99
  f"{self.url}/api/v1/task",
86
100
  files=files,
87
- data=data,
88
101
  headers=self._headers()
89
102
  )
90
103
  r.raise_for_status()
91
104
  return TaskResponse(**r.json()).with_client(self)
105
+
106
+ def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
107
+ """Update a task by its ID.
108
+
109
+ Args:
110
+ task_id: The ID of the task to update
111
+ config: The new configuration to use
92
112
 
113
+ Returns:
114
+ TaskResponse: The updated task response
115
+ """
116
+ files = prepare_upload_data(None, config)
117
+ r = self._session.patch(
118
+ f"{self.url}/api/v1/task/{task_id}",
119
+ files=files,
120
+ headers=self._headers()
121
+ )
122
+
123
+ r.raise_for_status()
124
+ return TaskResponse(**r.json()).with_client(self)
125
+
93
126
  def get_task(self, task_id: str) -> TaskResponse:
94
127
  """Get a task response by its ID.
95
128
 
@@ -106,3 +139,29 @@ class Chunkr(ChunkrBase):
106
139
  r.raise_for_status()
107
140
  return TaskResponse(**r.json()).with_client(self)
108
141
 
142
+
143
+ def delete_task(self, task_id: str) -> None:
144
+ """Delete a task by its ID.
145
+
146
+ Args:
147
+ task_id: The ID of the task to delete
148
+ """
149
+ r = self._session.delete(
150
+ f"{self.url}/api/v1/task/{task_id}",
151
+ headers=self._headers()
152
+ )
153
+ r.raise_for_status()
154
+
155
+ def cancel_task(self, task_id: str) -> None:
156
+ """Cancel a task by its ID.
157
+
158
+ Args:
159
+ task_id: The ID of the task to cancel
160
+ """
161
+ r = self._session.get(
162
+ f"{self.url}/api/v1/task/{task_id}/cancel",
163
+ headers=self._headers()
164
+ )
165
+ r.raise_for_status()
166
+
167
+
@@ -5,6 +5,7 @@ import httpx
5
5
  from pathlib import Path
6
6
  from PIL import Image
7
7
  from typing import Union, BinaryIO
8
+ from .misc import prepare_upload_data
8
9
 
9
10
  class ChunkrAsync(ChunkrBase):
10
11
  """Asynchronous Chunkr API client"""
@@ -43,10 +44,23 @@ class ChunkrAsync(ChunkrBase):
43
44
  Returns:
44
45
  TaskResponse: The completed task response
45
46
  """
46
- task = await self.start_upload(file, config)
47
+ task = await self.create_task(file, config)
47
48
  return await task.poll_async()
49
+
50
+ async def update(self, task_id: str, config: Configuration) -> TaskResponse:
51
+ """Update a task by its ID and wait for processing to complete.
52
+
53
+ Args:
54
+ task_id: The ID of the task to update
55
+ config: Configuration options for processing. Optional.
48
56
 
49
- async def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
57
+ Returns:
58
+ TaskResponse: The updated task response
59
+ """
60
+ task = await self.update_task(task_id, config)
61
+ return await task.poll_async()
62
+
63
+ async def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
50
64
  """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll_async()`.
51
65
 
52
66
  Args:
@@ -80,16 +94,26 @@ class ChunkrAsync(ChunkrBase):
80
94
  Returns:
81
95
  TaskResponse: The initial task response
82
96
  """
83
- files, data = self._prepare_upload_data(file, config)
97
+ files = prepare_upload_data(file, config)
84
98
  r = await self._client.post(
85
99
  f"{self.url}/api/v1/task",
86
100
  files=files,
87
- json=config.model_dump() if config else {},
88
101
  headers=self._headers()
89
102
  )
90
103
  r.raise_for_status()
91
104
  return TaskResponse(**r.json()).with_client(self)
92
105
 
106
+ async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
107
+ files = prepare_upload_data(None, config)
108
+ r = await self._client.patch(
109
+ f"{self.url}/api/v1/task/{task_id}",
110
+ files=files,
111
+ headers=self._headers()
112
+ )
113
+
114
+ r.raise_for_status()
115
+ return TaskResponse(**r.json()).with_client(self)
116
+
93
117
  async def get_task(self, task_id: str) -> TaskResponse:
94
118
  r = await self._client.get(
95
119
  f"{self.url}/api/v1/task/{task_id}",
@@ -97,7 +121,22 @@ class ChunkrAsync(ChunkrBase):
97
121
  )
98
122
  r.raise_for_status()
99
123
  return TaskResponse(**r.json()).with_client(self)
124
+
125
+ async def delete_task(self, task_id: str) -> None:
126
+ r = await self._client.delete(
127
+ f"{self.url}/api/v1/task/{task_id}",
128
+ headers=self._headers()
129
+ )
130
+ r.raise_for_status()
131
+
132
+ async def cancel_task(self, task_id: str) -> None:
133
+ r = await self._client.get(
134
+ f"{self.url}/api/v1/task/{task_id}/cancel",
135
+ headers=self._headers()
136
+ )
137
+ r.raise_for_status()
100
138
 
139
+
101
140
  async def __aenter__(self):
102
141
  return self
103
142
 
@@ -1,4 +1,4 @@
1
- from pydantic import BaseModel, Field, model_validator
1
+ from pydantic import BaseModel, Field, model_validator, ConfigDict
2
2
  from enum import Enum
3
3
  from typing import Optional, List, Dict
4
4
 
@@ -10,30 +10,30 @@ class CroppingStrategy(str, Enum):
10
10
  ALL = "All"
11
11
  AUTO = "Auto"
12
12
 
13
- class LlmConfig(BaseModel):
14
- model: str
15
- prompt: str
16
- temperature: float = 0.0
17
-
18
13
  class GenerationConfig(BaseModel):
19
14
  html: Optional[GenerationStrategy] = None
20
- llm: Optional[LlmConfig] = None
15
+ llm: Optional[str] = None
21
16
  markdown: Optional[GenerationStrategy] = None
22
17
  crop_image: Optional[CroppingStrategy] = None
23
18
 
24
19
  class SegmentProcessing(BaseModel):
25
- title: Optional[GenerationConfig] = None
26
- section_header: Optional[GenerationConfig] = None
27
- text: Optional[GenerationConfig] = None
28
- list_item: Optional[GenerationConfig] = None
29
- table: Optional[GenerationConfig] = None
30
- picture: Optional[GenerationConfig] = None
31
- caption: Optional[GenerationConfig] = None
32
- formula: Optional[GenerationConfig] = None
33
- footnote: Optional[GenerationConfig] = None
34
- page_header: Optional[GenerationConfig] = None
35
- page_footer: Optional[GenerationConfig] = None
36
- page: Optional[GenerationConfig] = None
20
+ model_config = ConfigDict(
21
+ populate_by_name=True,
22
+ alias_generator=str.title
23
+ )
24
+
25
+ title: Optional[GenerationConfig] = Field(default=None, alias="Title")
26
+ section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
27
+ text: Optional[GenerationConfig] = Field(default=None, alias="Text")
28
+ list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
29
+ table: Optional[GenerationConfig] = Field(default=None, alias="Table")
30
+ picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
31
+ caption: Optional[GenerationConfig] = Field(default=None, alias="Caption")
32
+ formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
33
+ footnote: Optional[GenerationConfig] = Field(default=None, alias="Footnote")
34
+ page_header: Optional[GenerationConfig] = Field(default=None, alias="PageHeader")
35
+ page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
36
+ page: Optional[GenerationConfig] = Field(default=None, alias="Page")
37
37
 
38
38
  class ChunkProcessing(BaseModel):
39
39
  target_length: Optional[int] = None
@@ -86,9 +86,9 @@ class Segment(BaseModel):
86
86
  bbox: BoundingBox
87
87
  content: str
88
88
  page_height: float
89
- html: Optional[str]
90
- image: Optional[str]
91
- markdown: Optional[str]
89
+ html: Optional[str] = None
90
+ image: Optional[str] = None
91
+ markdown: Optional[str] = None
92
92
  ocr: List[OCRResult]
93
93
  page_number: int
94
94
  page_width: float
@@ -104,8 +104,8 @@ class ExtractedJson(BaseModel):
104
104
  data: Dict
105
105
 
106
106
  class OutputResponse(BaseModel):
107
- chunks: List[Chunk] = []
108
- extracted_json: Optional[ExtractedJson]
107
+ chunks: List[Chunk]
108
+ extracted_json: Optional[ExtractedJson] = Field(default=None)
109
109
 
110
110
  class Model(str, Enum):
111
111
  FAST = "Fast"
@@ -0,0 +1,106 @@
1
+ import io
2
+ import json
3
+ from pathlib import Path
4
+ from PIL import Image
5
+ import requests
6
+ from typing import Union, Tuple, BinaryIO, Optional
7
+ from .config import Configuration
8
+
9
+
10
+ def prepare_file(
11
+ file: Union[str, Path, BinaryIO, Image.Image]
12
+ ) -> Tuple[str, BinaryIO]:
13
+ """Convert various file types into a tuple of (filename, file-like object)."""
14
+ # Handle URLs
15
+ if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
16
+ response = requests.get(file)
17
+ response.raise_for_status()
18
+ file_obj = io.BytesIO(response.content)
19
+ filename = Path(file.split('/')[-1]).name or 'downloaded_file'
20
+ return filename, file_obj
21
+
22
+ # Handle base64 strings
23
+ if isinstance(file, str) and ',' in file and ';base64,' in file:
24
+ try:
25
+ # Split header and data
26
+ header, base64_data = file.split(',', 1)
27
+ import base64
28
+ file_bytes = base64.b64decode(base64_data)
29
+ file_obj = io.BytesIO(file_bytes)
30
+
31
+ # Try to determine format from header
32
+ format = 'bin'
33
+ mime_type = header.split(':')[-1].split(';')[0].lower()
34
+
35
+ # Map MIME types to file extensions
36
+ mime_to_ext = {
37
+ 'application/pdf': 'pdf',
38
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
39
+ 'application/msword': 'doc',
40
+ 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
41
+ 'application/vnd.ms-powerpoint': 'ppt',
42
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
43
+ 'application/vnd.ms-excel': 'xls',
44
+ 'image/jpeg': 'jpg',
45
+ 'image/png': 'png',
46
+ 'image/jpg': 'jpg'
47
+ }
48
+
49
+ if mime_type in mime_to_ext:
50
+ format = mime_to_ext[mime_type]
51
+ else:
52
+ raise ValueError(f"Unsupported MIME type: {mime_type}")
53
+
54
+ return f"file.{format}", file_obj
55
+ except Exception as e:
56
+ raise ValueError(f"Invalid base64 string: {str(e)}")
57
+
58
+ # Handle file paths
59
+ if isinstance(file, (str, Path)):
60
+ path = Path(file).resolve()
61
+ if not path.exists():
62
+ raise FileNotFoundError(f"File not found: {file}")
63
+ return path.name, open(path, 'rb')
64
+
65
+ # Handle PIL Images
66
+ if isinstance(file, Image.Image):
67
+ img_byte_arr = io.BytesIO()
68
+ format = file.format or 'PNG'
69
+ file.save(img_byte_arr, format=format)
70
+ img_byte_arr.seek(0)
71
+ return f"image.{format.lower()}", img_byte_arr
72
+
73
+ # Handle file-like objects
74
+ if hasattr(file, 'read') and hasattr(file, 'seek'):
75
+ # Try to get the filename from the file object if possible
76
+ name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
77
+ return Path(name).name, file
78
+
79
+ raise TypeError(f"Unsupported file type: {type(file)}")
80
+
81
+
82
+
83
+ def prepare_upload_data(
84
+ file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
85
+ config: Optional[Configuration] = None
86
+ ) -> dict:
87
+ """Prepare files and data dictionaries for upload.
88
+
89
+ Args:
90
+ file: The file to upload
91
+ config: Optional configuration settings
92
+
93
+ Returns:
94
+ dict: (files dict) ready for upload
95
+ """
96
+ files = {}
97
+ if file:
98
+ filename, file_obj = prepare_file(file)
99
+ files = {"file": (filename, file_obj)}
100
+
101
+ if config:
102
+ config_dict = config.model_dump(mode="json", exclude_none=True)
103
+ for key, value in config_dict.items():
104
+ files[key] = (None, json.dumps(value), 'application/json')
105
+
106
+ return files
@@ -1,5 +1,6 @@
1
1
  from .protocol import ChunkrClientProtocol
2
2
  from .config import Configuration, OutputResponse
3
+ from .misc import prepare_upload_data
3
4
  import asyncio
4
5
  from datetime import datetime
5
6
  from enum import Enum
@@ -12,22 +13,23 @@ class Status(str, Enum):
12
13
  PROCESSING = "Processing"
13
14
  SUCCEEDED = "Succeeded"
14
15
  FAILED = "Failed"
16
+ CANCELLED = "Cancelled"
15
17
 
16
18
  class TaskResponse(BaseModel):
17
19
  configuration: Configuration
18
20
  created_at: datetime
19
- expires_at: Optional[datetime]
20
- file_name: Optional[str]
21
- finished_at: Optional[datetime]
22
- input_file_url: Optional[str]
21
+ expires_at: Optional[datetime] = None
22
+ file_name: Optional[str] = None
23
+ finished_at: Optional[datetime] = None
24
+ input_file_url: Optional[str] = None
23
25
  message: str
24
- output: Optional[OutputResponse]
25
- page_count: Optional[int]
26
- pdf_url: Optional[str]
27
- started_at: Optional[datetime]
26
+ output: Optional[OutputResponse] = None
27
+ page_count: Optional[int] = None
28
+ pdf_url: Optional[str] = None
29
+ started_at: Optional[datetime] = None
28
30
  status: Status
29
31
  task_id: str
30
- task_url: Optional[str]
32
+ task_url: Optional[str] = None
31
33
  _client: Optional[Union[ChunkrClientProtocol]] = PrivateAttr(default=None)
32
34
 
33
35
  def with_client(self, client: Union[ChunkrClientProtocol]) -> 'TaskResponse':
@@ -79,7 +81,8 @@ class TaskResponse(BaseModel):
79
81
  """Poll the task for completion."""
80
82
  while True:
81
83
  response = self._poll_request_sync()
82
- self.__dict__.update(response)
84
+ updated_task = TaskResponse(**response).with_client(self._client)
85
+ self.__dict__.update(updated_task.__dict__)
83
86
 
84
87
  if result := self._check_status():
85
88
  return result
@@ -90,7 +93,8 @@ class TaskResponse(BaseModel):
90
93
  """Poll the task for completion asynchronously."""
91
94
  while True:
92
95
  response = await self._poll_request_async()
93
- self.__dict__.update(response)
96
+ updated_task = TaskResponse(**response).with_client(self._client)
97
+ self.__dict__.update(updated_task.__dict__)
94
98
 
95
99
  if result := self._check_status():
96
100
  return result
@@ -108,6 +112,56 @@ class TaskResponse(BaseModel):
108
112
  if content:
109
113
  parts.append(content)
110
114
  return "\n".join(parts)
115
+
116
+ def update(self, config: Configuration) -> 'TaskResponse':
117
+ files = prepare_upload_data(None, config)
118
+ r = self._client._session.patch(
119
+ f"{self.task_url}",
120
+ files=files,
121
+ headers=self._client._headers()
122
+ )
123
+ r.raise_for_status()
124
+ return TaskResponse(**r.json()).with_client(self._client)
125
+
126
+ async def update_async(self, config: Configuration) -> 'TaskResponse':
127
+ files = prepare_upload_data(None, config)
128
+ r = await self._client._client.patch(
129
+ f"{self.task_url}",
130
+ files=files,
131
+ headers=self._client._headers()
132
+ )
133
+ r.raise_for_status()
134
+ return TaskResponse(**r.json()).with_client(self._client)
135
+
136
+ def cancel(self):
137
+ r = self._client._session.get(
138
+ f"{self.task_url}/cancel",
139
+ headers=self._client._headers()
140
+ )
141
+ r.raise_for_status()
142
+ self.poll()
143
+
144
+ async def cancel_async(self):
145
+ r = await self._client._client.get(
146
+ f"{self.task_url}/cancel",
147
+ headers=self._client._headers()
148
+ )
149
+ r.raise_for_status()
150
+ await self.poll_async()
151
+
152
+ def delete(self):
153
+ r = self._client._session.delete(
154
+ f"{self.task_url}",
155
+ headers=self._client._headers()
156
+ )
157
+ r.raise_for_status()
158
+
159
+ async def delete_async(self):
160
+ r = await self._client._client.delete(
161
+ f"{self.task_url}",
162
+ headers=self._client._headers()
163
+ )
164
+ r.raise_for_status()
111
165
 
112
166
  def html(self) -> str:
113
167
  """Get full HTML for the task"""
@@ -0,0 +1,12 @@
1
+ from chunkr_ai.api.chunkr import Chunkr
2
+ from chunkr_ai.models import Configuration
3
+ from chunkr_ai.api.config import SegmentationStrategy, ChunkProcessing
4
+
5
+ if __name__ == "__main__":
6
+ chunkr = Chunkr()
7
+ task = chunkr.update_task("556b4fe5-e3f7-48dc-9f56-0fb7fbacdb87", Configuration(
8
+ chunk_processing=ChunkProcessing(
9
+ target_length=1000
10
+ )
11
+ ))
12
+ print(task)
@@ -8,7 +8,6 @@ from .api.config import (
8
8
  GenerationStrategy,
9
9
  GenerationConfig,
10
10
  JsonSchema,
11
- LlmConfig,
12
11
  Model,
13
12
  OCRResult,
14
13
  OcrStrategy,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.6
3
+ Version: 0.0.8
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
@@ -16,6 +16,7 @@ src/chunkr_ai/api/base.py
16
16
  src/chunkr_ai/api/chunkr.py
17
17
  src/chunkr_ai/api/chunkr_async.py
18
18
  src/chunkr_ai/api/config.py
19
+ src/chunkr_ai/api/misc.py
19
20
  src/chunkr_ai/api/protocol.py
20
21
  src/chunkr_ai/api/task.py
21
22
  tests/test_chunkr.py
@@ -19,7 +19,7 @@ from chunkr_ai.models import (
19
19
 
20
20
  @pytest.fixture(params=[
21
21
  pytest.param(("sync", Chunkr()), id="sync"),
22
- pytest.param(("async", ChunkrAsync()), id="async")
22
+ # pytest.param(("async", ChunkrAsync()), id="async")
23
23
  ])
24
24
  def chunkr_client(request):
25
25
  return request.param
@@ -209,4 +209,121 @@ async def test_json_schema(chunkr_client, sample_path):
209
209
  assert response.status == "Succeeded"
210
210
  assert response.output is not None
211
211
 
212
-
212
+ @pytest.mark.asyncio
213
+ async def test_delete_task(chunkr_client, sample_path):
214
+ client_type, client = chunkr_client
215
+ response = await client.upload(sample_path) if client_type == "async" else client.upload(sample_path)
216
+ assert isinstance(response, TaskResponse)
217
+ assert response.task_id is not None
218
+ assert response.status == "Succeeded"
219
+ assert response.output is not None
220
+
221
+ if client_type == "async":
222
+ await client.delete_task(response.task_id)
223
+ with pytest.raises(Exception):
224
+ await client.get_task(response.task_id)
225
+ else:
226
+ client.delete_task(response.task_id)
227
+ with pytest.raises(Exception):
228
+ client.get_task(response.task_id)
229
+
230
+ @pytest.mark.asyncio
231
+ async def test_delete_task_direct(chunkr_client, sample_path):
232
+ client_type, client = chunkr_client
233
+ task = await client.upload(sample_path) if client_type == "async" else client.upload(sample_path)
234
+ assert isinstance(task, TaskResponse)
235
+ assert task.task_id is not None
236
+ assert task.status == "Succeeded"
237
+ assert task.output is not None
238
+
239
+ if client_type == "async":
240
+ await client.delete_task(task.task_id)
241
+ with pytest.raises(Exception):
242
+ await client.get_task(task.task_id)
243
+ else:
244
+ client.delete_task(task.task_id)
245
+ with pytest.raises(Exception):
246
+ client.get_task(task.task_id)
247
+
248
+ @pytest.mark.asyncio
249
+ async def test_cancel_task(chunkr_client, sample_path):
250
+ client_type, client = chunkr_client
251
+ response = await client.create_task(sample_path) if client_type == "async" else client.create_task(sample_path)
252
+ assert isinstance(response, TaskResponse)
253
+ assert response.task_id is not None
254
+ assert response.status == "Starting"
255
+
256
+ if client_type == "async":
257
+ await client.cancel_task(response.task_id)
258
+ assert (await client.get_task(response.task_id)).status == "Cancelled"
259
+ await response.poll_async()
260
+ else:
261
+ client.cancel_task(response.task_id)
262
+ assert client.get_task(response.task_id).status == "Cancelled"
263
+ response.poll()
264
+
265
+ assert response.output is None
266
+
267
+ @pytest.mark.asyncio
268
+ async def test_cancel_task_direct(chunkr_client, sample_path):
269
+ client_type, client = chunkr_client
270
+ task = await client.create_task(sample_path) if client_type == "async" else client.create_task(sample_path)
271
+ assert isinstance(task, TaskResponse)
272
+ assert task.task_id is not None
273
+ assert task.status == "Starting"
274
+
275
+ if client_type == "async":
276
+ await task.cancel_async()
277
+ else:
278
+ task.cancel()
279
+
280
+ assert task.status == "Cancelled"
281
+ assert task.output is None
282
+
283
+ @pytest.mark.asyncio
284
+ async def test_update_task(chunkr_client, sample_path):
285
+ client_type, client = chunkr_client
286
+ original_config = Configuration(
287
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
288
+ )
289
+ new_config = Configuration(
290
+ segmentation_strategy=SegmentationStrategy.PAGE,
291
+ )
292
+ response = await client.upload(sample_path, original_config) if client_type == "async" else client.upload(sample_path, original_config)
293
+ assert isinstance(response, TaskResponse)
294
+ assert response.task_id is not None
295
+ assert response.status == "Succeeded"
296
+ assert response.output is not None
297
+
298
+ if client_type == "async":
299
+ task = await client.update(response.task_id, new_config)
300
+ else:
301
+ task = client.update(response.task_id, new_config)
302
+
303
+ assert task.status == "Succeeded"
304
+ assert task.output is not None
305
+ assert task.configuration.segmentation_strategy == SegmentationStrategy.PAGE
306
+
307
+ @pytest.mark.asyncio
308
+ async def test_update_task_direct(chunkr_client, sample_path):
309
+ client_type, client = chunkr_client
310
+ original_config = Configuration(
311
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
312
+ )
313
+ new_config = Configuration(
314
+ segmentation_strategy=SegmentationStrategy.PAGE,
315
+ )
316
+ task = await client.upload(sample_path, original_config) if client_type == "async" else client.upload(sample_path, original_config)
317
+ assert isinstance(task, TaskResponse)
318
+ assert task.task_id is not None
319
+ assert task.status == "Succeeded"
320
+ assert task.output is not None
321
+
322
+ if client_type == "async":
323
+ await task.update_async(new_config)
324
+ else:
325
+ task.update(new_config)
326
+
327
+ assert task.status == "Succeeded"
328
+ assert task.output is not None
329
+ assert task.configuration.segmentation_strategy == SegmentationStrategy.PAGE
@@ -1,173 +0,0 @@
1
- from .config import Configuration
2
- from .task import TaskResponse
3
- from .auth import HeadersMixin
4
- from abc import abstractmethod
5
- from dotenv import load_dotenv
6
- import io
7
- import json
8
- import os
9
- from pathlib import Path
10
- from PIL import Image
11
- import requests
12
- from typing import BinaryIO, Tuple, Union
13
-
14
- class ChunkrBase(HeadersMixin):
15
- """Base class with shared functionality for Chunkr API clients."""
16
-
17
- def __init__(self, url: str = None, api_key: str = None):
18
- load_dotenv()
19
- self.url = (
20
- url or
21
- os.getenv('CHUNKR_URL') or
22
- 'https://api.chunkr.ai'
23
- )
24
- self._api_key = (
25
- api_key or
26
- os.getenv('CHUNKR_API_KEY')
27
- )
28
- if not self._api_key:
29
- raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
30
-
31
- self.url = self.url.rstrip("/")
32
-
33
- def _prepare_file(
34
- self,
35
- file: Union[str, Path, BinaryIO, Image.Image]
36
- ) -> Tuple[str, BinaryIO]:
37
- """Convert various file types into a tuple of (filename, file-like object).
38
-
39
- Args:
40
- file: Input file, can be:
41
- - String or Path to a file
42
- - URL string starting with http:// or https://
43
- - Base64 string
44
- - Opened binary file (mode='rb')
45
- - PIL/Pillow Image object
46
-
47
- Returns:
48
- Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
49
-
50
- Raises:
51
- FileNotFoundError: If the file path doesn't exist
52
- TypeError: If the file type is not supported
53
- ValueError: If the URL is invalid or unreachable
54
- ValueError: If the MIME type is unsupported
55
- """
56
- # Handle URLs
57
- if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
58
- response = requests.get(file)
59
- response.raise_for_status()
60
- file_obj = io.BytesIO(response.content)
61
- filename = Path(file.split('/')[-1]).name or 'downloaded_file'
62
- return filename, file_obj
63
-
64
- # Handle base64 strings
65
- if isinstance(file, str) and ',' in file and ';base64,' in file:
66
- try:
67
- # Split header and data
68
- header, base64_data = file.split(',', 1)
69
- import base64
70
- file_bytes = base64.b64decode(base64_data)
71
- file_obj = io.BytesIO(file_bytes)
72
-
73
- # Try to determine format from header
74
- format = 'bin'
75
- mime_type = header.split(':')[-1].split(';')[0].lower()
76
-
77
- # Map MIME types to file extensions
78
- mime_to_ext = {
79
- 'application/pdf': 'pdf',
80
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
81
- 'application/msword': 'doc',
82
- 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
83
- 'application/vnd.ms-powerpoint': 'ppt',
84
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
85
- 'application/vnd.ms-excel': 'xls',
86
- 'image/jpeg': 'jpg',
87
- 'image/png': 'png',
88
- 'image/jpg': 'jpg'
89
- }
90
-
91
- if mime_type in mime_to_ext:
92
- format = mime_to_ext[mime_type]
93
- else:
94
- raise ValueError(f"Unsupported MIME type: {mime_type}")
95
-
96
- return f"file.{format}", file_obj
97
- except Exception as e:
98
- raise ValueError(f"Invalid base64 string: {str(e)}")
99
-
100
- # Handle file paths
101
- if isinstance(file, (str, Path)):
102
- path = Path(file).resolve()
103
- if not path.exists():
104
- raise FileNotFoundError(f"File not found: {file}")
105
- return path.name, open(path, 'rb')
106
-
107
- # Handle PIL Images
108
- if isinstance(file, Image.Image):
109
- img_byte_arr = io.BytesIO()
110
- format = file.format or 'PNG'
111
- file.save(img_byte_arr, format=format)
112
- img_byte_arr.seek(0)
113
- return f"image.{format.lower()}", img_byte_arr
114
-
115
- # Handle file-like objects
116
- if hasattr(file, 'read') and hasattr(file, 'seek'):
117
- # Try to get the filename from the file object if possible
118
- name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
119
- return Path(name).name, file
120
-
121
- raise TypeError(f"Unsupported file type: {type(file)}")
122
-
123
- def _prepare_upload_data(
124
- self,
125
- file: Union[str, Path, BinaryIO, Image.Image],
126
- config: Configuration = None
127
- ) -> Tuple[dict, dict]:
128
- """Prepare files and data dictionaries for upload.
129
-
130
- Args:
131
- file: The file to upload
132
- config: Optional configuration settings
133
-
134
- Returns:
135
- Tuple[dict, dict]: (files dict, data dict) ready for upload
136
- """
137
- filename, file_obj = self._prepare_file(file)
138
- files = {"file": (filename, file_obj)}
139
- data = {}
140
-
141
- if config:
142
- config_dict = config.model_dump(mode="json", exclude_none=True)
143
- for key, value in config_dict.items():
144
- if isinstance(value, dict):
145
- files[key] = (None, json.dumps(value), 'application/json')
146
- else:
147
- data[key] = value
148
-
149
- return files, data
150
-
151
- @abstractmethod
152
- def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
153
- """Upload a file and wait for processing to complete.
154
-
155
- Must be implemented by subclasses.
156
- """
157
- pass
158
-
159
- @abstractmethod
160
- def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
161
- """Upload a file for processing and immediately return the task response.
162
-
163
- Must be implemented by subclasses.
164
- """
165
- pass
166
-
167
- @abstractmethod
168
- def get_task(self, task_id: str) -> TaskResponse:
169
- """Get a task response by its ID.
170
-
171
- Must be implemented by subclasses.
172
- """
173
- pass
File without changes
File without changes
File without changes
File without changes