chunkr-ai 0.0.12__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  from .config import Configuration
2
2
  from .task import TaskResponse
3
+ from .task_async import TaskResponseAsync
3
4
  from .auth import HeadersMixin
4
5
  from abc import abstractmethod
5
6
  from dotenv import load_dotenv
@@ -8,78 +9,152 @@ from pathlib import Path
8
9
  from PIL import Image
9
10
  from typing import BinaryIO, Union
10
11
 
12
+
11
13
  class ChunkrBase(HeadersMixin):
12
14
  """Base class with shared functionality for Chunkr API clients."""
13
15
 
14
16
  def __init__(self, url: str = None, api_key: str = None):
15
17
  load_dotenv()
16
- self.url = (
17
- url or
18
- os.getenv('CHUNKR_URL') or
19
- 'https://api.chunkr.ai'
20
- )
21
- self._api_key = (
22
- api_key or
23
- os.getenv('CHUNKR_API_KEY')
24
- )
18
+ self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
19
+ self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
25
20
  if not self._api_key:
26
- raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
27
-
21
+ raise ValueError(
22
+ "API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
23
+ )
24
+
28
25
  self.url = self.url.rstrip("/")
29
26
 
30
27
  @abstractmethod
31
- def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
28
+ def upload(
29
+ self,
30
+ file: Union[str, Path, BinaryIO, Image.Image],
31
+ config: Configuration = None,
32
+ ) -> Union[TaskResponse, TaskResponseAsync]:
32
33
  """Upload a file and wait for processing to complete.
33
-
34
- Must be implemented by subclasses.
34
+
35
+ Args:
36
+ file: The file to upload.
37
+ config: Configuration options for processing. Optional.
38
+
39
+ Examples:
40
+ ```python
41
+ # Upload from file path
42
+ await chunkr.upload("document.pdf")
43
+
44
+ # Upload from opened file
45
+ with open("document.pdf", "rb") as f:
46
+ await chunkr.upload(f)
47
+
48
+ # Upload from URL
49
+ await chunkr.upload("https://example.com/document.pdf")
50
+
51
+ # Upload from base64 string (must include MIME type header)
52
+ await chunkr.upload("data:application/pdf;base64,JVBERi0...")
53
+
54
+ # Upload an image
55
+ from PIL import Image
56
+ img = Image.open("photo.jpg")
57
+ await chunkr.upload(img)
58
+ ```
59
+ Returns:
60
+ TaskResponse: The completed task response
35
61
  """
36
62
  pass
37
-
63
+
38
64
  @abstractmethod
39
- def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
40
- """Update a task by its ID.
41
-
42
- Must be implemented by subclasses.
65
+ def update(
66
+ self, task_id: str, config: Configuration
67
+ ) -> Union[TaskResponse, TaskResponseAsync]:
68
+ """Update a task by its ID and wait for processing to complete.
69
+
70
+ Args:
71
+ task_id: The ID of the task to update
72
+ config: Configuration options for processing. Required.
73
+
74
+ Returns:
75
+ TaskResponse: The updated task response
43
76
  """
44
77
  pass
45
78
 
46
79
  @abstractmethod
47
- def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
48
- """Upload a file for processing and immediately return the task response.
49
-
50
- Must be implemented by subclasses.
80
+ def create_task(
81
+ self,
82
+ file: Union[str, Path, BinaryIO, Image.Image],
83
+ config: Configuration = None,
84
+ ) -> Union[TaskResponse, TaskResponseAsync]:
85
+ """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
86
+
87
+ Args:
88
+ file: The file to upload.
89
+ config: Configuration options for processing. Optional.
90
+
91
+ Examples:
92
+ ```python
93
+ # Upload from file path
94
+ task = await chunkr.create_task("document.pdf")
95
+
96
+ # Upload from opened file
97
+ with open("document.pdf", "rb") as f:
98
+ task = await chunkr.create_task(f)
99
+
100
+ # Upload from URL
101
+ task = await chunkr.create_task("https://example.com/document.pdf")
102
+
103
+ # Upload from base64 string (must include MIME type header)
104
+ task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
105
+
106
+ # Upload an image
107
+ from PIL import Image
108
+ img = Image.open("photo.jpg")
109
+ task = await chunkr.create_task(img)
110
+
111
+ # Wait for the task to complete - this can be done when needed
112
+ await task.poll()
113
+ ```
51
114
  """
52
115
  pass
53
116
 
54
117
  @abstractmethod
55
- def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
56
- """Update a task by its ID.
57
-
58
- Must be implemented by subclasses.
118
+ def update_task(
119
+ self, task_id: str, config: Configuration
120
+ ) -> Union[TaskResponse, TaskResponseAsync]:
121
+ """Update a task by its ID and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
122
+
123
+ Args:
124
+ task_id: The ID of the task to update
125
+ config: Configuration options for processing. Required.
126
+
127
+ Returns:
128
+ TaskResponse: The updated task response
59
129
  """
60
130
  pass
61
-
131
+
62
132
  @abstractmethod
63
- def get_task(self, task_id: str) -> TaskResponse:
133
+ def get_task(self, task_id: str) -> Union[TaskResponse, TaskResponseAsync]:
64
134
  """Get a task response by its ID.
65
-
66
- Must be implemented by subclasses.
135
+
136
+ Args:
137
+ task_id: The ID of the task to get
138
+
139
+ Returns:
140
+ TaskResponse: The task response
67
141
  """
68
142
  pass
69
143
 
70
144
  @abstractmethod
71
145
  def delete_task(self, task_id: str) -> None:
72
146
  """Delete a task by its ID.
73
-
74
- Must be implemented by subclasses.
147
+
148
+ Args:
149
+ task_id: The ID of the task to delete
75
150
  """
76
151
  pass
77
-
152
+
78
153
  @abstractmethod
79
154
  def cancel_task(self, task_id: str) -> None:
80
155
  """Cancel a task by its ID.
81
-
82
- Must be implemented by subclasses.
156
+
157
+ Args:
158
+ task_id: The ID of the task to cancel
83
159
  """
84
160
  pass
85
-
chunkr_ai/api/config.py CHANGED
@@ -3,28 +3,31 @@ from enum import Enum
3
3
  from typing import Optional, List, Dict, Union, Type
4
4
  from .schema import from_pydantic
5
5
 
6
+
6
7
  class GenerationStrategy(str, Enum):
7
8
  LLM = "LLM"
8
9
  AUTO = "Auto"
9
10
 
11
+
10
12
  class CroppingStrategy(str, Enum):
11
- ALL = "All"
13
+ ALL = "All"
12
14
  AUTO = "Auto"
13
15
 
16
+
14
17
  class GenerationConfig(BaseModel):
15
18
  html: Optional[GenerationStrategy] = None
16
19
  llm: Optional[str] = None
17
20
  markdown: Optional[GenerationStrategy] = None
18
21
  crop_image: Optional[CroppingStrategy] = None
19
22
 
23
+
20
24
  class SegmentProcessing(BaseModel):
21
- model_config = ConfigDict(
22
- populate_by_name=True,
23
- alias_generator=str.title
24
- )
25
-
25
+ model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
26
+
26
27
  title: Optional[GenerationConfig] = Field(default=None, alias="Title")
27
- section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
28
+ section_header: Optional[GenerationConfig] = Field(
29
+ default=None, alias="SectionHeader"
30
+ )
28
31
  text: Optional[GenerationConfig] = Field(default=None, alias="Text")
29
32
  list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
30
33
  table: Optional[GenerationConfig] = Field(default=None, alias="Table")
@@ -36,38 +39,46 @@ class SegmentProcessing(BaseModel):
36
39
  page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
37
40
  page: Optional[GenerationConfig] = Field(default=None, alias="Page")
38
41
 
42
+
39
43
  class ChunkProcessing(BaseModel):
40
44
  target_length: Optional[int] = None
41
45
 
46
+
42
47
  class Property(BaseModel):
43
48
  name: str
44
49
  prop_type: str
45
50
  description: Optional[str] = None
46
51
  default: Optional[str] = None
47
52
 
53
+
48
54
  class JsonSchema(BaseModel):
49
55
  title: str
50
56
  properties: List[Property]
51
57
 
58
+
52
59
  class OcrStrategy(str, Enum):
53
60
  ALL = "All"
54
61
  AUTO = "Auto"
55
-
62
+
63
+
56
64
  class SegmentationStrategy(str, Enum):
57
65
  LAYOUT_ANALYSIS = "LayoutAnalysis"
58
66
  PAGE = "Page"
59
67
 
68
+
60
69
  class BoundingBox(BaseModel):
61
70
  left: float
62
71
  top: float
63
72
  width: float
64
73
  height: float
65
74
 
75
+
66
76
  class OCRResult(BaseModel):
67
77
  bbox: BoundingBox
68
78
  text: str
69
79
  confidence: Optional[float]
70
80
 
81
+
71
82
  class SegmentType(str, Enum):
72
83
  CAPTION = "Caption"
73
84
  FOOTNOTE = "Footnote"
@@ -82,6 +93,7 @@ class SegmentType(str, Enum):
82
93
  TEXT = "Text"
83
94
  TITLE = "Title"
84
95
 
96
+
85
97
  class Segment(BaseModel):
86
98
  bbox: BoundingBox
87
99
  content: str
@@ -95,33 +107,40 @@ class Segment(BaseModel):
95
107
  segment_id: str
96
108
  segment_type: SegmentType
97
109
 
110
+
98
111
  class Chunk(BaseModel):
99
112
  chunk_id: str
100
113
  chunk_length: int
101
114
  segments: List[Segment]
102
115
 
116
+
103
117
  class ExtractedJson(BaseModel):
104
118
  data: Dict
105
119
 
120
+
106
121
  class OutputResponse(BaseModel):
107
122
  chunks: List[Chunk]
108
123
  extracted_json: Optional[ExtractedJson] = Field(default=None)
109
124
 
125
+
110
126
  class Model(str, Enum):
111
127
  FAST = "Fast"
112
128
  HIGH_QUALITY = "HighQuality"
113
129
 
130
+
114
131
  class Configuration(BaseModel):
115
132
  chunk_processing: Optional[ChunkProcessing] = Field(default=None)
116
133
  expires_in: Optional[int] = Field(default=None)
117
134
  high_resolution: Optional[bool] = Field(default=None)
118
- json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(default=None)
135
+ json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(
136
+ default=None
137
+ )
119
138
  model: Optional[Model] = Field(default=None)
120
139
  ocr_strategy: Optional[OcrStrategy] = Field(default=None)
121
140
  segment_processing: Optional[SegmentProcessing] = Field(default=None)
122
141
  segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)
123
142
 
124
- @model_validator(mode='before')
143
+ @model_validator(mode="before")
125
144
  def map_deprecated_fields(cls, values: Dict) -> Dict:
126
145
  if isinstance(values, dict) and "target_chunk_length" in values:
127
146
  target_length = values.pop("target_chunk_length")
@@ -130,13 +149,18 @@ class Configuration(BaseModel):
130
149
  values["chunk_processing"]["target_length"] = target_length
131
150
  return values
132
151
 
133
- @model_validator(mode='after')
134
- def convert_json_schema(self) -> 'Configuration':
135
- if self.json_schema is not None and not isinstance(self.json_schema, JsonSchema):
136
- if isinstance(self.json_schema, (BaseModel, type)) and issubclass(getattr(self.json_schema, '__class__', type), BaseModel):
152
+ @model_validator(mode="after")
153
+ def convert_json_schema(self) -> "Configuration":
154
+ if self.json_schema is not None and not isinstance(
155
+ self.json_schema, JsonSchema
156
+ ):
157
+ if isinstance(self.json_schema, (BaseModel, type)) and issubclass(
158
+ getattr(self.json_schema, "__class__", type), BaseModel
159
+ ):
137
160
  self.json_schema = JsonSchema(**from_pydantic(self.json_schema))
138
161
  return self
139
162
 
163
+
140
164
  class Status(str, Enum):
141
165
  STARTING = "Starting"
142
166
  PROCESSING = "Processing"
chunkr_ai/api/misc.py CHANGED
@@ -6,68 +6,74 @@ from PIL import Image
6
6
  import requests
7
7
  from typing import Union, Tuple, BinaryIO, Optional
8
8
 
9
- def prepare_file(
10
- file: Union[str, Path, BinaryIO, Image.Image]
11
- ) -> Tuple[str, BinaryIO]:
9
+
10
+ def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[str, BinaryIO]:
12
11
  """Convert various file types into a tuple of (filename, file-like object)."""
13
12
  # Handle URLs
14
- if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
13
+ if isinstance(file, str) and (
14
+ file.startswith("http://") or file.startswith("https://")
15
+ ):
15
16
  response = requests.get(file)
16
17
  response.raise_for_status()
17
-
18
+
18
19
  # Try to get filename from Content-Disposition header first
19
20
  filename = None
20
- content_disposition = response.headers.get('Content-Disposition')
21
- if content_disposition and 'filename=' in content_disposition:
22
- filename = content_disposition.split('filename=')[-1].strip('"\'')
23
-
21
+ content_disposition = response.headers.get("Content-Disposition")
22
+ if content_disposition and "filename=" in content_disposition:
23
+ filename = content_disposition.split("filename=")[-1].strip("\"'")
24
+
24
25
  # If no Content-Disposition, try to get clean filename from URL path
25
26
  if not filename:
26
27
  from urllib.parse import urlparse, unquote
28
+
27
29
  parsed_url = urlparse(file)
28
30
  path = unquote(parsed_url.path)
29
31
  filename = Path(path).name if path else None
30
-
32
+
31
33
  # Fallback to default name if we couldn't extract one
32
- filename = filename or 'downloaded_file'
33
-
34
+ filename = filename or "downloaded_file"
35
+
34
36
  # Sanitize filename: remove invalid characters and limit length
35
37
  import re
36
- filename = re.sub(r'[<>:"/\\|?*%]', '_', filename) # Replace invalid chars with underscore
37
- filename = re.sub(r'\s+', '_', filename) # Replace whitespace with underscore
38
- filename = filename.strip('._') # Remove leading/trailing dots and underscores
39
- filename = filename[:255] # Limit length to 255 characters
40
-
38
+
39
+ filename = re.sub(
40
+ r'[<>:"/\\|?*%]', "_", filename
41
+ ) # Replace invalid chars with underscore
42
+ filename = re.sub(r"\s+", "_", filename) # Replace whitespace with underscore
43
+ filename = filename.strip("._") # Remove leading/trailing dots and underscores
44
+ filename = filename[:255] # Limit length to 255 characters
45
+
41
46
  file_obj = io.BytesIO(response.content)
42
47
  return filename, file_obj
43
48
 
44
49
  # Handle base64 strings
45
- if isinstance(file, str) and ',' in file and ';base64,' in file:
50
+ if isinstance(file, str) and "," in file and ";base64," in file:
46
51
  try:
47
52
  # Split header and data
48
- header, base64_data = file.split(',', 1)
53
+ header, base64_data = file.split(",", 1)
49
54
  import base64
55
+
50
56
  file_bytes = base64.b64decode(base64_data)
51
57
  file_obj = io.BytesIO(file_bytes)
52
-
58
+
53
59
  # Try to determine format from header
54
- format = 'bin'
55
- mime_type = header.split(':')[-1].split(';')[0].lower()
56
-
60
+ format = "bin"
61
+ mime_type = header.split(":")[-1].split(";")[0].lower()
62
+
57
63
  # Map MIME types to file extensions
58
64
  mime_to_ext = {
59
- 'application/pdf': 'pdf',
60
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
61
- 'application/msword': 'doc',
62
- 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
63
- 'application/vnd.ms-powerpoint': 'ppt',
64
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
65
- 'application/vnd.ms-excel': 'xls',
66
- 'image/jpeg': 'jpg',
67
- 'image/png': 'png',
68
- 'image/jpg': 'jpg'
65
+ "application/pdf": "pdf",
66
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
67
+ "application/msword": "doc",
68
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
69
+ "application/vnd.ms-powerpoint": "ppt",
70
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
71
+ "application/vnd.ms-excel": "xls",
72
+ "image/jpeg": "jpg",
73
+ "image/png": "png",
74
+ "image/jpg": "jpg",
69
75
  }
70
-
76
+
71
77
  if mime_type in mime_to_ext:
72
78
  format = mime_to_ext[mime_type]
73
79
  else:
@@ -82,36 +88,37 @@ def prepare_file(
82
88
  path = Path(file).resolve()
83
89
  if not path.exists():
84
90
  raise FileNotFoundError(f"File not found: {file}")
85
- return path.name, open(path, 'rb')
91
+ return path.name, open(path, "rb")
86
92
 
87
93
  # Handle PIL Images
88
94
  if isinstance(file, Image.Image):
89
95
  img_byte_arr = io.BytesIO()
90
- format = file.format or 'PNG'
96
+ format = file.format or "PNG"
91
97
  file.save(img_byte_arr, format=format)
92
98
  img_byte_arr.seek(0)
93
99
  return f"image.{format.lower()}", img_byte_arr
94
100
 
95
101
  # Handle file-like objects
96
- if hasattr(file, 'read') and hasattr(file, 'seek'):
102
+ if hasattr(file, "read") and hasattr(file, "seek"):
97
103
  # Try to get the filename from the file object if possible
98
- name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
104
+ name = (
105
+ getattr(file, "name", "document") if hasattr(file, "name") else "document"
106
+ )
99
107
  return Path(name).name, file
100
108
 
101
109
  raise TypeError(f"Unsupported file type: {type(file)}")
102
110
 
103
111
 
104
-
105
112
  def prepare_upload_data(
106
113
  file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
107
- config: Optional[Configuration] = None
114
+ config: Optional[Configuration] = None,
108
115
  ) -> dict:
109
116
  """Prepare files and data dictionaries for upload.
110
-
117
+
111
118
  Args:
112
119
  file: The file to upload
113
120
  config: Optional configuration settings
114
-
121
+
115
122
  Returns:
116
123
  dict: The files dict, ready for upload; each config field is added as a JSON-encoded entry
117
124
  """
@@ -123,6 +130,6 @@ def prepare_upload_data(
123
130
  if config:
124
131
  config_dict = config.model_dump(mode="json", exclude_none=True)
125
132
  for key, value in config_dict.items():
126
- files[key] = (None, json.dumps(value), 'application/json')
127
-
133
+ files[key] = (None, json.dumps(value), "application/json")
134
+
128
135
  return files
chunkr_ai/api/protocol.py CHANGED
@@ -1,14 +1,16 @@
1
- from typing import runtime_checkable, Protocol
1
+ from typing import Optional, runtime_checkable, Protocol
2
2
  from requests import Session
3
3
  from httpx import AsyncClient
4
4
 
5
+
5
6
  @runtime_checkable
6
7
  class ChunkrClientProtocol(Protocol):
7
8
  """Protocol defining the interface for Chunkr clients"""
9
+
8
10
  url: str
9
11
  _api_key: str
10
- _session: Session
11
- _client: AsyncClient
12
+ _session: Optional[Session] = None
13
+ _client: Optional[AsyncClient] = None
12
14
 
13
15
  def get_api_key(self) -> str:
14
16
  """Get the API key"""
@@ -16,4 +18,4 @@ class ChunkrClientProtocol(Protocol):
16
18
 
17
19
  def _headers(self) -> dict:
18
20
  """Return headers required for API requests"""
19
- ...
21
+ ...