chunkr-ai 0.0.6__tar.gz → 0.0.7__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (26)
  1. {chunkr_ai-0.0.6/src/chunkr_ai.egg-info → chunkr_ai-0.0.7}/PKG-INFO +1 -1
  2. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/pyproject.toml +1 -1
  3. chunkr_ai-0.0.7/src/chunkr_ai/api/base.py +85 -0
  4. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai/api/chunkr.py +63 -4
  5. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai/api/chunkr_async.py +43 -4
  6. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai/api/config.py +19 -19
  7. chunkr_ai-0.0.7/src/chunkr_ai/api/misc.py +106 -0
  8. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai/api/task.py +56 -2
  9. chunkr_ai-0.0.7/src/chunkr_ai/main.py +12 -0
  10. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai/models.py +0 -1
  11. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7/src/chunkr_ai.egg-info}/PKG-INFO +1 -1
  12. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai.egg-info/SOURCES.txt +1 -0
  13. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/tests/test_chunkr.py +119 -2
  14. chunkr_ai-0.0.6/src/chunkr_ai/api/base.py +0 -173
  15. chunkr_ai-0.0.6/src/chunkr_ai/main.py +0 -0
  16. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/LICENSE +0 -0
  17. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/README.md +0 -0
  18. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/setup.cfg +0 -0
  19. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai/__init__.py +0 -0
  20. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai/api/__init__.py +0 -0
  21. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai/api/api.py +0 -0
  22. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai/api/auth.py +0 -0
  23. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai/api/protocol.py +0 -0
  24. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai.egg-info/dependency_links.txt +0 -0
  25. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai.egg-info/requires.txt +0 -0
  26. {chunkr_ai-0.0.6 → chunkr_ai-0.0.7}/src/chunkr_ai.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.6
3
+ Version: 0.0.7
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chunkr-ai"
7
- version = "0.0.6"
7
+ version = "0.0.7"
8
8
  authors = [{"name" = "Ishaan Kapoor", "email" = "ishaan@lumina.sh"}]
9
9
  description = "Python client for Chunkr: open source document intelligence"
10
10
  readme = "README.md"
@@ -0,0 +1,85 @@
1
+ from .config import Configuration
2
+ from .task import TaskResponse
3
+ from .auth import HeadersMixin
4
+ from abc import abstractmethod
5
+ from dotenv import load_dotenv
6
+ import os
7
+ from pathlib import Path
8
+ from PIL import Image
9
+ from typing import BinaryIO, Union
10
+
11
+ class ChunkrBase(HeadersMixin):
12
+ """Base class with shared functionality for Chunkr API clients."""
13
+
14
+ def __init__(self, url: str = None, api_key: str = None):
15
+ load_dotenv()
16
+ self.url = (
17
+ url or
18
+ os.getenv('CHUNKR_URL') or
19
+ 'https://api.chunkr.ai'
20
+ )
21
+ self._api_key = (
22
+ api_key or
23
+ os.getenv('CHUNKR_API_KEY')
24
+ )
25
+ if not self._api_key:
26
+ raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
27
+
28
+ self.url = self.url.rstrip("/")
29
+
30
+ @abstractmethod
31
+ def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
32
+ """Upload a file and wait for processing to complete.
33
+
34
+ Must be implemented by subclasses.
35
+ """
36
+ pass
37
+
38
+ @abstractmethod
39
+ def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
40
+ """Update a task by its ID.
41
+
42
+ Must be implemented by subclasses.
43
+ """
44
+ pass
45
+
46
+ @abstractmethod
47
+ def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
48
+ """Upload a file for processing and immediately return the task response.
49
+
50
+ Must be implemented by subclasses.
51
+ """
52
+ pass
53
+
54
+ @abstractmethod
55
+ def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
56
+ """Update a task by its ID.
57
+
58
+ Must be implemented by subclasses.
59
+ """
60
+ pass
61
+
62
+ @abstractmethod
63
+ def get_task(self, task_id: str) -> TaskResponse:
64
+ """Get a task response by its ID.
65
+
66
+ Must be implemented by subclasses.
67
+ """
68
+ pass
69
+
70
+ @abstractmethod
71
+ def delete_task(self, task_id: str) -> None:
72
+ """Delete a task by its ID.
73
+
74
+ Must be implemented by subclasses.
75
+ """
76
+ pass
77
+
78
+ @abstractmethod
79
+ def cancel_task(self, task_id: str) -> None:
80
+ """Cancel a task by its ID.
81
+
82
+ Must be implemented by subclasses.
83
+ """
84
+ pass
85
+
@@ -5,6 +5,7 @@ from pathlib import Path
5
5
  from PIL import Image
6
6
  import requests
7
7
  from typing import Union, BinaryIO
8
+ from .misc import prepare_upload_data
8
9
 
9
10
  class Chunkr(ChunkrBase):
10
11
  """Chunkr API client"""
@@ -43,10 +44,23 @@ class Chunkr(ChunkrBase):
43
44
  Returns:
44
45
  TaskResponse: The completed task response
45
46
  """
46
- task = self.start_upload(file, config)
47
+ task = self.create_task(file, config)
47
48
  return task.poll()
49
+
50
+ def update(self, task_id: str, config: Configuration) -> TaskResponse:
51
+ """Update a task by its ID and wait for processing to complete.
52
+
53
+ Args:
54
+ task_id: The ID of the task to update
55
+ config: Configuration options for processing. Optional.
48
56
 
49
- def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
57
+ Returns:
58
+ TaskResponse: The updated task response
59
+ """
60
+ task = self.update_task(task_id, config)
61
+ return task.poll()
62
+
63
+ def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
50
64
  """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`
51
65
 
52
66
  Args:
@@ -80,16 +94,35 @@ class Chunkr(ChunkrBase):
80
94
  Returns:
81
95
  TaskResponse: The initial task response
82
96
  """
83
- files, data = self._prepare_upload_data(file, config)
97
+ files= prepare_upload_data(file, config)
84
98
  r = self._session.post(
85
99
  f"{self.url}/api/v1/task",
86
100
  files=files,
87
- data=data,
88
101
  headers=self._headers()
89
102
  )
90
103
  r.raise_for_status()
91
104
  return TaskResponse(**r.json()).with_client(self)
105
+
106
+ def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
107
+ """Update a task by its ID.
108
+
109
+ Args:
110
+ task_id: The ID of the task to update
111
+ config: The new configuration to use
92
112
 
113
+ Returns:
114
+ TaskResponse: The updated task response
115
+ """
116
+ files = prepare_upload_data(None, config)
117
+ r = self._session.patch(
118
+ f"{self.url}/api/v1/task/{task_id}",
119
+ files=files,
120
+ headers=self._headers()
121
+ )
122
+
123
+ r.raise_for_status()
124
+ return TaskResponse(**r.json()).with_client(self)
125
+
93
126
  def get_task(self, task_id: str) -> TaskResponse:
94
127
  """Get a task response by its ID.
95
128
 
@@ -106,3 +139,29 @@ class Chunkr(ChunkrBase):
106
139
  r.raise_for_status()
107
140
  return TaskResponse(**r.json()).with_client(self)
108
141
 
142
+
143
+ def delete_task(self, task_id: str) -> None:
144
+ """Delete a task by its ID.
145
+
146
+ Args:
147
+ task_id: The ID of the task to delete
148
+ """
149
+ r = self._session.delete(
150
+ f"{self.url}/api/v1/task/{task_id}",
151
+ headers=self._headers()
152
+ )
153
+ r.raise_for_status()
154
+
155
+ def cancel_task(self, task_id: str) -> None:
156
+ """Cancel a task by its ID.
157
+
158
+ Args:
159
+ task_id: The ID of the task to cancel
160
+ """
161
+ r = self._session.get(
162
+ f"{self.url}/api/v1/task/{task_id}/cancel",
163
+ headers=self._headers()
164
+ )
165
+ r.raise_for_status()
166
+
167
+
@@ -5,6 +5,7 @@ import httpx
5
5
  from pathlib import Path
6
6
  from PIL import Image
7
7
  from typing import Union, BinaryIO
8
+ from .misc import prepare_upload_data
8
9
 
9
10
  class ChunkrAsync(ChunkrBase):
10
11
  """Asynchronous Chunkr API client"""
@@ -43,10 +44,23 @@ class ChunkrAsync(ChunkrBase):
43
44
  Returns:
44
45
  TaskResponse: The completed task response
45
46
  """
46
- task = await self.start_upload(file, config)
47
+ task = await self.create_task(file, config)
47
48
  return await task.poll_async()
49
+
50
+ async def update(self, task_id: str, config: Configuration) -> TaskResponse:
51
+ """Update a task by its ID and wait for processing to complete.
52
+
53
+ Args:
54
+ task_id: The ID of the task to update
55
+ config: Configuration options for processing. Optional.
48
56
 
49
- async def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
57
+ Returns:
58
+ TaskResponse: The updated task response
59
+ """
60
+ task = await self.update_task(task_id, config)
61
+ return await task.poll_async()
62
+
63
+ async def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
50
64
  """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll_async()`.
51
65
 
52
66
  Args:
@@ -80,16 +94,26 @@ class ChunkrAsync(ChunkrBase):
80
94
  Returns:
81
95
  TaskResponse: The initial task response
82
96
  """
83
- files, data = self._prepare_upload_data(file, config)
97
+ files = prepare_upload_data(file, config)
84
98
  r = await self._client.post(
85
99
  f"{self.url}/api/v1/task",
86
100
  files=files,
87
- json=config.model_dump() if config else {},
88
101
  headers=self._headers()
89
102
  )
90
103
  r.raise_for_status()
91
104
  return TaskResponse(**r.json()).with_client(self)
92
105
 
106
+ async def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
107
+ files = prepare_upload_data(None, config)
108
+ r = await self._client.patch(
109
+ f"{self.url}/api/v1/task/{task_id}",
110
+ files=files,
111
+ headers=self._headers()
112
+ )
113
+
114
+ r.raise_for_status()
115
+ return TaskResponse(**r.json()).with_client(self)
116
+
93
117
  async def get_task(self, task_id: str) -> TaskResponse:
94
118
  r = await self._client.get(
95
119
  f"{self.url}/api/v1/task/{task_id}",
@@ -97,7 +121,22 @@ class ChunkrAsync(ChunkrBase):
97
121
  )
98
122
  r.raise_for_status()
99
123
  return TaskResponse(**r.json()).with_client(self)
124
+
125
+ async def delete_task(self, task_id: str) -> None:
126
+ r = await self._client.delete(
127
+ f"{self.url}/api/v1/task/{task_id}",
128
+ headers=self._headers()
129
+ )
130
+ r.raise_for_status()
131
+
132
+ async def cancel_task(self, task_id: str) -> None:
133
+ r = await self._client.get(
134
+ f"{self.url}/api/v1/task/{task_id}/cancel",
135
+ headers=self._headers()
136
+ )
137
+ r.raise_for_status()
100
138
 
139
+
101
140
  async def __aenter__(self):
102
141
  return self
103
142
 
@@ -1,4 +1,4 @@
1
- from pydantic import BaseModel, Field, model_validator
1
+ from pydantic import BaseModel, Field, model_validator, ConfigDict
2
2
  from enum import Enum
3
3
  from typing import Optional, List, Dict
4
4
 
@@ -10,30 +10,30 @@ class CroppingStrategy(str, Enum):
10
10
  ALL = "All"
11
11
  AUTO = "Auto"
12
12
 
13
- class LlmConfig(BaseModel):
14
- model: str
15
- prompt: str
16
- temperature: float = 0.0
17
-
18
13
  class GenerationConfig(BaseModel):
19
14
  html: Optional[GenerationStrategy] = None
20
- llm: Optional[LlmConfig] = None
15
+ llm: Optional[str] = None
21
16
  markdown: Optional[GenerationStrategy] = None
22
17
  crop_image: Optional[CroppingStrategy] = None
23
18
 
24
19
  class SegmentProcessing(BaseModel):
25
- title: Optional[GenerationConfig] = None
26
- section_header: Optional[GenerationConfig] = None
27
- text: Optional[GenerationConfig] = None
28
- list_item: Optional[GenerationConfig] = None
29
- table: Optional[GenerationConfig] = None
30
- picture: Optional[GenerationConfig] = None
31
- caption: Optional[GenerationConfig] = None
32
- formula: Optional[GenerationConfig] = None
33
- footnote: Optional[GenerationConfig] = None
34
- page_header: Optional[GenerationConfig] = None
35
- page_footer: Optional[GenerationConfig] = None
36
- page: Optional[GenerationConfig] = None
20
+ model_config = ConfigDict(
21
+ populate_by_name=True,
22
+ alias_generator=str.title
23
+ )
24
+
25
+ title: Optional[GenerationConfig] = Field(default=None, alias="Title")
26
+ section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
27
+ text: Optional[GenerationConfig] = Field(default=None, alias="Text")
28
+ list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
29
+ table: Optional[GenerationConfig] = Field(default=None, alias="Table")
30
+ picture: Optional[GenerationConfig] = Field(default=None, alias="Picture")
31
+ caption: Optional[GenerationConfig] = Field(default=None, alias="Caption")
32
+ formula: Optional[GenerationConfig] = Field(default=None, alias="Formula")
33
+ footnote: Optional[GenerationConfig] = Field(default=None, alias="Footnote")
34
+ page_header: Optional[GenerationConfig] = Field(default=None, alias="PageHeader")
35
+ page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
36
+ page: Optional[GenerationConfig] = Field(default=None, alias="Page")
37
37
 
38
38
  class ChunkProcessing(BaseModel):
39
39
  target_length: Optional[int] = None
@@ -0,0 +1,106 @@
1
+ import io
2
+ import json
3
+ from pathlib import Path
4
+ from PIL import Image
5
+ import requests
6
+ from typing import Union, Tuple, BinaryIO, Optional
7
+ from .config import Configuration
8
+
9
+
10
+ def prepare_file(
11
+ file: Union[str, Path, BinaryIO, Image.Image]
12
+ ) -> Tuple[str, BinaryIO]:
13
+ """Convert various file types into a tuple of (filename, file-like object)."""
14
+ # Handle URLs
15
+ if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
16
+ response = requests.get(file)
17
+ response.raise_for_status()
18
+ file_obj = io.BytesIO(response.content)
19
+ filename = Path(file.split('/')[-1]).name or 'downloaded_file'
20
+ return filename, file_obj
21
+
22
+ # Handle base64 strings
23
+ if isinstance(file, str) and ',' in file and ';base64,' in file:
24
+ try:
25
+ # Split header and data
26
+ header, base64_data = file.split(',', 1)
27
+ import base64
28
+ file_bytes = base64.b64decode(base64_data)
29
+ file_obj = io.BytesIO(file_bytes)
30
+
31
+ # Try to determine format from header
32
+ format = 'bin'
33
+ mime_type = header.split(':')[-1].split(';')[0].lower()
34
+
35
+ # Map MIME types to file extensions
36
+ mime_to_ext = {
37
+ 'application/pdf': 'pdf',
38
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
39
+ 'application/msword': 'doc',
40
+ 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
41
+ 'application/vnd.ms-powerpoint': 'ppt',
42
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
43
+ 'application/vnd.ms-excel': 'xls',
44
+ 'image/jpeg': 'jpg',
45
+ 'image/png': 'png',
46
+ 'image/jpg': 'jpg'
47
+ }
48
+
49
+ if mime_type in mime_to_ext:
50
+ format = mime_to_ext[mime_type]
51
+ else:
52
+ raise ValueError(f"Unsupported MIME type: {mime_type}")
53
+
54
+ return f"file.{format}", file_obj
55
+ except Exception as e:
56
+ raise ValueError(f"Invalid base64 string: {str(e)}")
57
+
58
+ # Handle file paths
59
+ if isinstance(file, (str, Path)):
60
+ path = Path(file).resolve()
61
+ if not path.exists():
62
+ raise FileNotFoundError(f"File not found: {file}")
63
+ return path.name, open(path, 'rb')
64
+
65
+ # Handle PIL Images
66
+ if isinstance(file, Image.Image):
67
+ img_byte_arr = io.BytesIO()
68
+ format = file.format or 'PNG'
69
+ file.save(img_byte_arr, format=format)
70
+ img_byte_arr.seek(0)
71
+ return f"image.{format.lower()}", img_byte_arr
72
+
73
+ # Handle file-like objects
74
+ if hasattr(file, 'read') and hasattr(file, 'seek'):
75
+ # Try to get the filename from the file object if possible
76
+ name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
77
+ return Path(name).name, file
78
+
79
+ raise TypeError(f"Unsupported file type: {type(file)}")
80
+
81
+
82
+
83
+ def prepare_upload_data(
84
+ file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
85
+ config: Optional[Configuration] = None
86
+ ) -> dict:
87
+ """Prepare files and data dictionaries for upload.
88
+
89
+ Args:
90
+ file: The file to upload
91
+ config: Optional configuration settings
92
+
93
+ Returns:
94
+ dict: (files dict) ready for upload
95
+ """
96
+ files = {}
97
+ if file:
98
+ filename, file_obj = prepare_file(file)
99
+ files = {"file": (filename, file_obj)}
100
+
101
+ if config:
102
+ config_dict = config.model_dump(mode="json", exclude_none=True)
103
+ for key, value in config_dict.items():
104
+ files[key] = (None, json.dumps(value), 'application/json')
105
+
106
+ return files
@@ -1,5 +1,6 @@
1
1
  from .protocol import ChunkrClientProtocol
2
2
  from .config import Configuration, OutputResponse
3
+ from .misc import prepare_upload_data
3
4
  import asyncio
4
5
  from datetime import datetime
5
6
  from enum import Enum
@@ -12,6 +13,7 @@ class Status(str, Enum):
12
13
  PROCESSING = "Processing"
13
14
  SUCCEEDED = "Succeeded"
14
15
  FAILED = "Failed"
16
+ CANCELLED = "Cancelled"
15
17
 
16
18
  class TaskResponse(BaseModel):
17
19
  configuration: Configuration
@@ -79,7 +81,8 @@ class TaskResponse(BaseModel):
79
81
  """Poll the task for completion."""
80
82
  while True:
81
83
  response = self._poll_request_sync()
82
- self.__dict__.update(response)
84
+ updated_task = TaskResponse(**response).with_client(self._client)
85
+ self.__dict__.update(updated_task.__dict__)
83
86
 
84
87
  if result := self._check_status():
85
88
  return result
@@ -90,7 +93,8 @@ class TaskResponse(BaseModel):
90
93
  """Poll the task for completion asynchronously."""
91
94
  while True:
92
95
  response = await self._poll_request_async()
93
- self.__dict__.update(response)
96
+ updated_task = TaskResponse(**response).with_client(self._client)
97
+ self.__dict__.update(updated_task.__dict__)
94
98
 
95
99
  if result := self._check_status():
96
100
  return result
@@ -108,6 +112,56 @@ class TaskResponse(BaseModel):
108
112
  if content:
109
113
  parts.append(content)
110
114
  return "\n".join(parts)
115
+
116
+ def update(self, config: Configuration) -> 'TaskResponse':
117
+ files = prepare_upload_data(None, config)
118
+ r = self._client._session.patch(
119
+ f"{self.task_url}",
120
+ files=files,
121
+ headers=self._client._headers()
122
+ )
123
+ r.raise_for_status()
124
+ return TaskResponse(**r.json()).with_client(self._client)
125
+
126
+ async def update_async(self, config: Configuration) -> 'TaskResponse':
127
+ files = prepare_upload_data(None, config)
128
+ r = await self._client._client.patch(
129
+ f"{self.task_url}",
130
+ files=files,
131
+ headers=self._client._headers()
132
+ )
133
+ r.raise_for_status()
134
+ return TaskResponse(**r.json()).with_client(self._client)
135
+
136
+ def cancel(self):
137
+ r = self._client._session.get(
138
+ f"{self.task_url}/cancel",
139
+ headers=self._client._headers()
140
+ )
141
+ r.raise_for_status()
142
+ self.poll()
143
+
144
+ async def cancel_async(self):
145
+ r = await self._client._client.get(
146
+ f"{self.task_url}/cancel",
147
+ headers=self._client._headers()
148
+ )
149
+ r.raise_for_status()
150
+ await self.poll_async()
151
+
152
+ def delete(self):
153
+ r = self._client._session.delete(
154
+ f"{self.task_url}",
155
+ headers=self._client._headers()
156
+ )
157
+ r.raise_for_status()
158
+
159
+ async def delete_async(self):
160
+ r = await self._client._client.delete(
161
+ f"{self.task_url}",
162
+ headers=self._client._headers()
163
+ )
164
+ r.raise_for_status()
111
165
 
112
166
  def html(self) -> str:
113
167
  """Get full HTML for the task"""
@@ -0,0 +1,12 @@
1
+ from chunkr_ai.api.chunkr import Chunkr
2
+ from chunkr_ai.models import Configuration
3
+ from chunkr_ai.api.config import SegmentationStrategy, ChunkProcessing
4
+
5
+ if __name__ == "__main__":
6
+ chunkr = Chunkr()
7
+ task = chunkr.update_task("556b4fe5-e3f7-48dc-9f56-0fb7fbacdb87", Configuration(
8
+ chunk_processing=ChunkProcessing(
9
+ target_length=1000
10
+ )
11
+ ))
12
+ print(task)
@@ -8,7 +8,6 @@ from .api.config import (
8
8
  GenerationStrategy,
9
9
  GenerationConfig,
10
10
  JsonSchema,
11
- LlmConfig,
12
11
  Model,
13
12
  OCRResult,
14
13
  OcrStrategy,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: chunkr-ai
3
- Version: 0.0.6
3
+ Version: 0.0.7
4
4
  Summary: Python client for Chunkr: open source document intelligence
5
5
  Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
6
  Project-URL: Homepage, https://chunkr.ai
@@ -16,6 +16,7 @@ src/chunkr_ai/api/base.py
16
16
  src/chunkr_ai/api/chunkr.py
17
17
  src/chunkr_ai/api/chunkr_async.py
18
18
  src/chunkr_ai/api/config.py
19
+ src/chunkr_ai/api/misc.py
19
20
  src/chunkr_ai/api/protocol.py
20
21
  src/chunkr_ai/api/task.py
21
22
  tests/test_chunkr.py
@@ -19,7 +19,7 @@ from chunkr_ai.models import (
19
19
 
20
20
  @pytest.fixture(params=[
21
21
  pytest.param(("sync", Chunkr()), id="sync"),
22
- pytest.param(("async", ChunkrAsync()), id="async")
22
+ # pytest.param(("async", ChunkrAsync()), id="async")
23
23
  ])
24
24
  def chunkr_client(request):
25
25
  return request.param
@@ -209,4 +209,121 @@ async def test_json_schema(chunkr_client, sample_path):
209
209
  assert response.status == "Succeeded"
210
210
  assert response.output is not None
211
211
 
212
-
212
+ @pytest.mark.asyncio
213
+ async def test_delete_task(chunkr_client, sample_path):
214
+ client_type, client = chunkr_client
215
+ response = await client.upload(sample_path) if client_type == "async" else client.upload(sample_path)
216
+ assert isinstance(response, TaskResponse)
217
+ assert response.task_id is not None
218
+ assert response.status == "Succeeded"
219
+ assert response.output is not None
220
+
221
+ if client_type == "async":
222
+ await client.delete_task(response.task_id)
223
+ with pytest.raises(Exception):
224
+ await client.get_task(response.task_id)
225
+ else:
226
+ client.delete_task(response.task_id)
227
+ with pytest.raises(Exception):
228
+ client.get_task(response.task_id)
229
+
230
+ @pytest.mark.asyncio
231
+ async def test_delete_task_direct(chunkr_client, sample_path):
232
+ client_type, client = chunkr_client
233
+ task = await client.upload(sample_path) if client_type == "async" else client.upload(sample_path)
234
+ assert isinstance(task, TaskResponse)
235
+ assert task.task_id is not None
236
+ assert task.status == "Succeeded"
237
+ assert task.output is not None
238
+
239
+ if client_type == "async":
240
+ await client.delete_task(task.task_id)
241
+ with pytest.raises(Exception):
242
+ await client.get_task(task.task_id)
243
+ else:
244
+ client.delete_task(task.task_id)
245
+ with pytest.raises(Exception):
246
+ client.get_task(task.task_id)
247
+
248
+ @pytest.mark.asyncio
249
+ async def test_cancel_task(chunkr_client, sample_path):
250
+ client_type, client = chunkr_client
251
+ response = await client.create_task(sample_path) if client_type == "async" else client.create_task(sample_path)
252
+ assert isinstance(response, TaskResponse)
253
+ assert response.task_id is not None
254
+ assert response.status == "Starting"
255
+
256
+ if client_type == "async":
257
+ await client.cancel_task(response.task_id)
258
+ assert (await client.get_task(response.task_id)).status == "Cancelled"
259
+ await response.poll_async()
260
+ else:
261
+ client.cancel_task(response.task_id)
262
+ assert client.get_task(response.task_id).status == "Cancelled"
263
+ response.poll()
264
+
265
+ assert response.output is None
266
+
267
+ @pytest.mark.asyncio
268
+ async def test_cancel_task_direct(chunkr_client, sample_path):
269
+ client_type, client = chunkr_client
270
+ task = await client.create_task(sample_path) if client_type == "async" else client.create_task(sample_path)
271
+ assert isinstance(task, TaskResponse)
272
+ assert task.task_id is not None
273
+ assert task.status == "Starting"
274
+
275
+ if client_type == "async":
276
+ await task.cancel_async()
277
+ else:
278
+ task.cancel()
279
+
280
+ assert task.status == "Cancelled"
281
+ assert task.output is None
282
+
283
+ @pytest.mark.asyncio
284
+ async def test_update_task(chunkr_client, sample_path):
285
+ client_type, client = chunkr_client
286
+ original_config = Configuration(
287
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
288
+ )
289
+ new_config = Configuration(
290
+ segmentation_strategy=SegmentationStrategy.PAGE,
291
+ )
292
+ response = await client.upload(sample_path, original_config) if client_type == "async" else client.upload(sample_path, original_config)
293
+ assert isinstance(response, TaskResponse)
294
+ assert response.task_id is not None
295
+ assert response.status == "Succeeded"
296
+ assert response.output is not None
297
+
298
+ if client_type == "async":
299
+ task = await client.update(response.task_id, new_config)
300
+ else:
301
+ task = client.update(response.task_id, new_config)
302
+
303
+ assert task.status == "Succeeded"
304
+ assert task.output is not None
305
+ assert task.configuration.segmentation_strategy == SegmentationStrategy.PAGE
306
+
307
+ @pytest.mark.asyncio
308
+ async def test_update_task_direct(chunkr_client, sample_path):
309
+ client_type, client = chunkr_client
310
+ original_config = Configuration(
311
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
312
+ )
313
+ new_config = Configuration(
314
+ segmentation_strategy=SegmentationStrategy.PAGE,
315
+ )
316
+ task = await client.upload(sample_path, original_config) if client_type == "async" else client.upload(sample_path, original_config)
317
+ assert isinstance(task, TaskResponse)
318
+ assert task.task_id is not None
319
+ assert task.status == "Succeeded"
320
+ assert task.output is not None
321
+
322
+ if client_type == "async":
323
+ await task.update_async(new_config)
324
+ else:
325
+ task.update(new_config)
326
+
327
+ assert task.status == "Succeeded"
328
+ assert task.output is not None
329
+ assert task.configuration.segmentation_strategy == SegmentationStrategy.PAGE
@@ -1,173 +0,0 @@
1
- from .config import Configuration
2
- from .task import TaskResponse
3
- from .auth import HeadersMixin
4
- from abc import abstractmethod
5
- from dotenv import load_dotenv
6
- import io
7
- import json
8
- import os
9
- from pathlib import Path
10
- from PIL import Image
11
- import requests
12
- from typing import BinaryIO, Tuple, Union
13
-
14
- class ChunkrBase(HeadersMixin):
15
- """Base class with shared functionality for Chunkr API clients."""
16
-
17
- def __init__(self, url: str = None, api_key: str = None):
18
- load_dotenv()
19
- self.url = (
20
- url or
21
- os.getenv('CHUNKR_URL') or
22
- 'https://api.chunkr.ai'
23
- )
24
- self._api_key = (
25
- api_key or
26
- os.getenv('CHUNKR_API_KEY')
27
- )
28
- if not self._api_key:
29
- raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
30
-
31
- self.url = self.url.rstrip("/")
32
-
33
- def _prepare_file(
34
- self,
35
- file: Union[str, Path, BinaryIO, Image.Image]
36
- ) -> Tuple[str, BinaryIO]:
37
- """Convert various file types into a tuple of (filename, file-like object).
38
-
39
- Args:
40
- file: Input file, can be:
41
- - String or Path to a file
42
- - URL string starting with http:// or https://
43
- - Base64 string
44
- - Opened binary file (mode='rb')
45
- - PIL/Pillow Image object
46
-
47
- Returns:
48
- Tuple[str, BinaryIO]: (filename, file-like object) ready for upload
49
-
50
- Raises:
51
- FileNotFoundError: If the file path doesn't exist
52
- TypeError: If the file type is not supported
53
- ValueError: If the URL is invalid or unreachable
54
- ValueError: If the MIME type is unsupported
55
- """
56
- # Handle URLs
57
- if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
58
- response = requests.get(file)
59
- response.raise_for_status()
60
- file_obj = io.BytesIO(response.content)
61
- filename = Path(file.split('/')[-1]).name or 'downloaded_file'
62
- return filename, file_obj
63
-
64
- # Handle base64 strings
65
- if isinstance(file, str) and ',' in file and ';base64,' in file:
66
- try:
67
- # Split header and data
68
- header, base64_data = file.split(',', 1)
69
- import base64
70
- file_bytes = base64.b64decode(base64_data)
71
- file_obj = io.BytesIO(file_bytes)
72
-
73
- # Try to determine format from header
74
- format = 'bin'
75
- mime_type = header.split(':')[-1].split(';')[0].lower()
76
-
77
- # Map MIME types to file extensions
78
- mime_to_ext = {
79
- 'application/pdf': 'pdf',
80
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
81
- 'application/msword': 'doc',
82
- 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
83
- 'application/vnd.ms-powerpoint': 'ppt',
84
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
85
- 'application/vnd.ms-excel': 'xls',
86
- 'image/jpeg': 'jpg',
87
- 'image/png': 'png',
88
- 'image/jpg': 'jpg'
89
- }
90
-
91
- if mime_type in mime_to_ext:
92
- format = mime_to_ext[mime_type]
93
- else:
94
- raise ValueError(f"Unsupported MIME type: {mime_type}")
95
-
96
- return f"file.{format}", file_obj
97
- except Exception as e:
98
- raise ValueError(f"Invalid base64 string: {str(e)}")
99
-
100
- # Handle file paths
101
- if isinstance(file, (str, Path)):
102
- path = Path(file).resolve()
103
- if not path.exists():
104
- raise FileNotFoundError(f"File not found: {file}")
105
- return path.name, open(path, 'rb')
106
-
107
- # Handle PIL Images
108
- if isinstance(file, Image.Image):
109
- img_byte_arr = io.BytesIO()
110
- format = file.format or 'PNG'
111
- file.save(img_byte_arr, format=format)
112
- img_byte_arr.seek(0)
113
- return f"image.{format.lower()}", img_byte_arr
114
-
115
- # Handle file-like objects
116
- if hasattr(file, 'read') and hasattr(file, 'seek'):
117
- # Try to get the filename from the file object if possible
118
- name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
119
- return Path(name).name, file
120
-
121
- raise TypeError(f"Unsupported file type: {type(file)}")
122
-
123
- def _prepare_upload_data(
124
- self,
125
- file: Union[str, Path, BinaryIO, Image.Image],
126
- config: Configuration = None
127
- ) -> Tuple[dict, dict]:
128
- """Prepare files and data dictionaries for upload.
129
-
130
- Args:
131
- file: The file to upload
132
- config: Optional configuration settings
133
-
134
- Returns:
135
- Tuple[dict, dict]: (files dict, data dict) ready for upload
136
- """
137
- filename, file_obj = self._prepare_file(file)
138
- files = {"file": (filename, file_obj)}
139
- data = {}
140
-
141
- if config:
142
- config_dict = config.model_dump(mode="json", exclude_none=True)
143
- for key, value in config_dict.items():
144
- if isinstance(value, dict):
145
- files[key] = (None, json.dumps(value), 'application/json')
146
- else:
147
- data[key] = value
148
-
149
- return files, data
150
-
151
- @abstractmethod
152
- def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
153
- """Upload a file and wait for processing to complete.
154
-
155
- Must be implemented by subclasses.
156
- """
157
- pass
158
-
159
- @abstractmethod
160
- def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
161
- """Upload a file for processing and immediately return the task response.
162
-
163
- Must be implemented by subclasses.
164
- """
165
- pass
166
-
167
- @abstractmethod
168
- def get_task(self, task_id: str) -> TaskResponse:
169
- """Get a task response by its ID.
170
-
171
- Must be implemented by subclasses.
172
- """
173
- pass
File without changes
File without changes
File without changes
File without changes