chunkr-ai 0.0.12__py3-none-any.whl → 0.0.15__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,6 @@
1
1
  from .config import Configuration
2
2
  from .task import TaskResponse
3
+ from .task_async import TaskResponseAsync
3
4
  from .auth import HeadersMixin
4
5
  from abc import abstractmethod
5
6
  from dotenv import load_dotenv
@@ -8,78 +9,152 @@ from pathlib import Path
8
9
  from PIL import Image
9
10
  from typing import BinaryIO, Union
10
11
 
12
+
11
13
  class ChunkrBase(HeadersMixin):
12
14
  """Base class with shared functionality for Chunkr API clients."""
13
15
 
14
16
  def __init__(self, url: str = None, api_key: str = None):
15
17
  load_dotenv()
16
- self.url = (
17
- url or
18
- os.getenv('CHUNKR_URL') or
19
- 'https://api.chunkr.ai'
20
- )
21
- self._api_key = (
22
- api_key or
23
- os.getenv('CHUNKR_API_KEY')
24
- )
18
+ self.url = url or os.getenv("CHUNKR_URL") or "https://api.chunkr.ai"
19
+ self._api_key = api_key or os.getenv("CHUNKR_API_KEY")
25
20
  if not self._api_key:
26
- raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
27
-
21
+ raise ValueError(
22
+ "API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai"
23
+ )
24
+
28
25
  self.url = self.url.rstrip("/")
29
26
 
30
27
  @abstractmethod
31
- def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
28
+ def upload(
29
+ self,
30
+ file: Union[str, Path, BinaryIO, Image.Image],
31
+ config: Configuration = None,
32
+ ) -> Union[TaskResponse, TaskResponseAsync]:
32
33
  """Upload a file and wait for processing to complete.
33
-
34
- Must be implemented by subclasses.
34
+
35
+ Args:
36
+ file: The file to upload.
37
+ config: Configuration options for processing. Optional.
38
+
39
+ Examples:
40
+ ```python
41
+ # Upload from file path
42
+ await chunkr.upload("document.pdf")
43
+
44
+ # Upload from opened file
45
+ with open("document.pdf", "rb") as f:
46
+ await chunkr.upload(f)
47
+
48
+ # Upload from URL
49
+ await chunkr.upload("https://example.com/document.pdf")
50
+
51
+ # Upload from base64 string (must include MIME type header)
52
+ await chunkr.upload("data:application/pdf;base64,JVBERi0...")
53
+
54
+ # Upload an image
55
+ from PIL import Image
56
+ img = Image.open("photo.jpg")
57
+ await chunkr.upload(img)
58
+ ```
59
+ Returns:
60
+ TaskResponse: The completed task response
35
61
  """
36
62
  pass
37
-
63
+
38
64
  @abstractmethod
39
- def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
40
- """Update a task by its ID.
41
-
42
- Must be implemented by subclasses.
65
+ def update(
66
+ self, task_id: str, config: Configuration
67
+ ) -> Union[TaskResponse, TaskResponseAsync]:
68
+ """Update a task by its ID and wait for processing to complete.
69
+
70
+ Args:
71
+ task_id: The ID of the task to update
72
+ config: Configuration options for processing. Required.
73
+
74
+ Returns:
75
+ TaskResponse: The updated task response
43
76
  """
44
77
  pass
45
78
 
46
79
  @abstractmethod
47
- def create_task(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
48
- """Upload a file for processing and immediately return the task response.
49
-
50
- Must be implemented by subclasses.
80
+ def create_task(
81
+ self,
82
+ file: Union[str, Path, BinaryIO, Image.Image],
83
+ config: Configuration = None,
84
+ ) -> Union[TaskResponse, TaskResponseAsync]:
85
+ """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
86
+
87
+ Args:
88
+ file: The file to upload.
89
+ config: Configuration options for processing. Optional.
90
+
91
+ Examples:
92
+ ```python
93
+ # Upload from file path
94
+ task = await chunkr.create_task("document.pdf")
95
+
96
+ # Upload from opened file
97
+ with open("document.pdf", "rb") as f:
98
+ task = await chunkr.create_task(f)
99
+
100
+ # Upload from URL
101
+ task = await chunkr.create_task("https://example.com/document.pdf")
102
+
103
+ # Upload from base64 string (must include MIME type header)
104
+ task = await chunkr.create_task("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
105
+
106
+ # Upload an image
107
+ from PIL import Image
108
+ img = Image.open("photo.jpg")
109
+ task = await chunkr.create_task(img)
110
+
111
+ # Wait for the task to complete - this can be done when needed
112
+ await task.poll()
113
+ ```
51
114
  """
52
115
  pass
53
116
 
54
117
  @abstractmethod
55
- def update_task(self, task_id: str, config: Configuration) -> TaskResponse:
56
- """Update a task by its ID.
57
-
58
- Must be implemented by subclasses.
118
+ def update_task(
119
+ self, task_id: str, config: Configuration
120
+ ) -> Union[TaskResponse, TaskResponseAsync]:
121
+ """Update a task by its ID and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`.
122
+
123
+ Args:
124
+ task_id: The ID of the task to update
125
+ config: Configuration options for processing. Required.
126
+
127
+ Returns:
128
+ TaskResponse: The updated task response
59
129
  """
60
130
  pass
61
-
131
+
62
132
  @abstractmethod
63
- def get_task(self, task_id: str) -> TaskResponse:
133
+ def get_task(self, task_id: str) -> Union[TaskResponse, TaskResponseAsync]:
64
134
  """Get a task response by its ID.
65
-
66
- Must be implemented by subclasses.
135
+
136
+ Args:
137
+ task_id: The ID of the task to get
138
+
139
+ Returns:
140
+ TaskResponse: The task response
67
141
  """
68
142
  pass
69
143
 
70
144
  @abstractmethod
71
145
  def delete_task(self, task_id: str) -> None:
72
146
  """Delete a task by its ID.
73
-
74
- Must be implemented by subclasses.
147
+
148
+ Args:
149
+ task_id: The ID of the task to delete
75
150
  """
76
151
  pass
77
-
152
+
78
153
  @abstractmethod
79
154
  def cancel_task(self, task_id: str) -> None:
80
155
  """Cancel a task by its ID.
81
-
82
- Must be implemented by subclasses.
156
+
157
+ Args:
158
+ task_id: The ID of the task to cancel
83
159
  """
84
160
  pass
85
-
chunkr_ai/api/config.py CHANGED
@@ -3,28 +3,31 @@ from enum import Enum
3
3
  from typing import Optional, List, Dict, Union, Type
4
4
  from .schema import from_pydantic
5
5
 
6
+
6
7
  class GenerationStrategy(str, Enum):
7
8
  LLM = "LLM"
8
9
  AUTO = "Auto"
9
10
 
11
+
10
12
  class CroppingStrategy(str, Enum):
11
- ALL = "All"
13
+ ALL = "All"
12
14
  AUTO = "Auto"
13
15
 
16
+
14
17
  class GenerationConfig(BaseModel):
15
18
  html: Optional[GenerationStrategy] = None
16
19
  llm: Optional[str] = None
17
20
  markdown: Optional[GenerationStrategy] = None
18
21
  crop_image: Optional[CroppingStrategy] = None
19
22
 
23
+
20
24
  class SegmentProcessing(BaseModel):
21
- model_config = ConfigDict(
22
- populate_by_name=True,
23
- alias_generator=str.title
24
- )
25
-
25
+ model_config = ConfigDict(populate_by_name=True, alias_generator=str.title)
26
+
26
27
  title: Optional[GenerationConfig] = Field(default=None, alias="Title")
27
- section_header: Optional[GenerationConfig] = Field(default=None, alias="SectionHeader")
28
+ section_header: Optional[GenerationConfig] = Field(
29
+ default=None, alias="SectionHeader"
30
+ )
28
31
  text: Optional[GenerationConfig] = Field(default=None, alias="Text")
29
32
  list_item: Optional[GenerationConfig] = Field(default=None, alias="ListItem")
30
33
  table: Optional[GenerationConfig] = Field(default=None, alias="Table")
@@ -36,38 +39,46 @@ class SegmentProcessing(BaseModel):
36
39
  page_footer: Optional[GenerationConfig] = Field(default=None, alias="PageFooter")
37
40
  page: Optional[GenerationConfig] = Field(default=None, alias="Page")
38
41
 
42
+
39
43
  class ChunkProcessing(BaseModel):
40
44
  target_length: Optional[int] = None
41
45
 
46
+
42
47
  class Property(BaseModel):
43
48
  name: str
44
49
  prop_type: str
45
50
  description: Optional[str] = None
46
51
  default: Optional[str] = None
47
52
 
53
+
48
54
  class JsonSchema(BaseModel):
49
55
  title: str
50
56
  properties: List[Property]
51
57
 
58
+
52
59
  class OcrStrategy(str, Enum):
53
60
  ALL = "All"
54
61
  AUTO = "Auto"
55
-
62
+
63
+
56
64
  class SegmentationStrategy(str, Enum):
57
65
  LAYOUT_ANALYSIS = "LayoutAnalysis"
58
66
  PAGE = "Page"
59
67
 
68
+
60
69
  class BoundingBox(BaseModel):
61
70
  left: float
62
71
  top: float
63
72
  width: float
64
73
  height: float
65
74
 
75
+
66
76
  class OCRResult(BaseModel):
67
77
  bbox: BoundingBox
68
78
  text: str
69
79
  confidence: Optional[float]
70
80
 
81
+
71
82
  class SegmentType(str, Enum):
72
83
  CAPTION = "Caption"
73
84
  FOOTNOTE = "Footnote"
@@ -82,6 +93,7 @@ class SegmentType(str, Enum):
82
93
  TEXT = "Text"
83
94
  TITLE = "Title"
84
95
 
96
+
85
97
  class Segment(BaseModel):
86
98
  bbox: BoundingBox
87
99
  content: str
@@ -95,33 +107,40 @@ class Segment(BaseModel):
95
107
  segment_id: str
96
108
  segment_type: SegmentType
97
109
 
110
+
98
111
  class Chunk(BaseModel):
99
112
  chunk_id: str
100
113
  chunk_length: int
101
114
  segments: List[Segment]
102
115
 
116
+
103
117
  class ExtractedJson(BaseModel):
104
118
  data: Dict
105
119
 
120
+
106
121
  class OutputResponse(BaseModel):
107
122
  chunks: List[Chunk]
108
123
  extracted_json: Optional[ExtractedJson] = Field(default=None)
109
124
 
125
+
110
126
  class Model(str, Enum):
111
127
  FAST = "Fast"
112
128
  HIGH_QUALITY = "HighQuality"
113
129
 
130
+
114
131
  class Configuration(BaseModel):
115
132
  chunk_processing: Optional[ChunkProcessing] = Field(default=None)
116
133
  expires_in: Optional[int] = Field(default=None)
117
134
  high_resolution: Optional[bool] = Field(default=None)
118
- json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(default=None)
135
+ json_schema: Optional[Union[JsonSchema, Type[BaseModel], BaseModel]] = Field(
136
+ default=None
137
+ )
119
138
  model: Optional[Model] = Field(default=None)
120
139
  ocr_strategy: Optional[OcrStrategy] = Field(default=None)
121
140
  segment_processing: Optional[SegmentProcessing] = Field(default=None)
122
141
  segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)
123
142
 
124
- @model_validator(mode='before')
143
+ @model_validator(mode="before")
125
144
  def map_deprecated_fields(cls, values: Dict) -> Dict:
126
145
  if isinstance(values, dict) and "target_chunk_length" in values:
127
146
  target_length = values.pop("target_chunk_length")
@@ -130,13 +149,18 @@ class Configuration(BaseModel):
130
149
  values["chunk_processing"]["target_length"] = target_length
131
150
  return values
132
151
 
133
- @model_validator(mode='after')
134
- def convert_json_schema(self) -> 'Configuration':
135
- if self.json_schema is not None and not isinstance(self.json_schema, JsonSchema):
136
- if isinstance(self.json_schema, (BaseModel, type)) and issubclass(getattr(self.json_schema, '__class__', type), BaseModel):
152
+ @model_validator(mode="after")
153
+ def convert_json_schema(self) -> "Configuration":
154
+ if self.json_schema is not None and not isinstance(
155
+ self.json_schema, JsonSchema
156
+ ):
157
+ if isinstance(self.json_schema, (BaseModel, type)) and issubclass(
158
+ getattr(self.json_schema, "__class__", type), BaseModel
159
+ ):
137
160
  self.json_schema = JsonSchema(**from_pydantic(self.json_schema))
138
161
  return self
139
162
 
163
+
140
164
  class Status(str, Enum):
141
165
  STARTING = "Starting"
142
166
  PROCESSING = "Processing"
chunkr_ai/api/misc.py CHANGED
@@ -6,68 +6,74 @@ from PIL import Image
6
6
  import requests
7
7
  from typing import Union, Tuple, BinaryIO, Optional
8
8
 
9
- def prepare_file(
10
- file: Union[str, Path, BinaryIO, Image.Image]
11
- ) -> Tuple[str, BinaryIO]:
9
+
10
+ def prepare_file(file: Union[str, Path, BinaryIO, Image.Image]) -> Tuple[str, BinaryIO]:
12
11
  """Convert various file types into a tuple of (filename, file-like object)."""
13
12
  # Handle URLs
14
- if isinstance(file, str) and (file.startswith('http://') or file.startswith('https://')):
13
+ if isinstance(file, str) and (
14
+ file.startswith("http://") or file.startswith("https://")
15
+ ):
15
16
  response = requests.get(file)
16
17
  response.raise_for_status()
17
-
18
+
18
19
  # Try to get filename from Content-Disposition header first
19
20
  filename = None
20
- content_disposition = response.headers.get('Content-Disposition')
21
- if content_disposition and 'filename=' in content_disposition:
22
- filename = content_disposition.split('filename=')[-1].strip('"\'')
23
-
21
+ content_disposition = response.headers.get("Content-Disposition")
22
+ if content_disposition and "filename=" in content_disposition:
23
+ filename = content_disposition.split("filename=")[-1].strip("\"'")
24
+
24
25
  # If no Content-Disposition, try to get clean filename from URL path
25
26
  if not filename:
26
27
  from urllib.parse import urlparse, unquote
28
+
27
29
  parsed_url = urlparse(file)
28
30
  path = unquote(parsed_url.path)
29
31
  filename = Path(path).name if path else None
30
-
32
+
31
33
  # Fallback to default name if we couldn't extract one
32
- filename = filename or 'downloaded_file'
33
-
34
+ filename = filename or "downloaded_file"
35
+
34
36
  # Sanitize filename: remove invalid characters and limit length
35
37
  import re
36
- filename = re.sub(r'[<>:"/\\|?*%]', '_', filename) # Replace invalid chars with underscore
37
- filename = re.sub(r'\s+', '_', filename) # Replace whitespace with underscore
38
- filename = filename.strip('._') # Remove leading/trailing dots and underscores
39
- filename = filename[:255] # Limit length to 255 characters
40
-
38
+
39
+ filename = re.sub(
40
+ r'[<>:"/\\|?*%]', "_", filename
41
+ ) # Replace invalid chars with underscore
42
+ filename = re.sub(r"\s+", "_", filename) # Replace whitespace with underscore
43
+ filename = filename.strip("._") # Remove leading/trailing dots and underscores
44
+ filename = filename[:255] # Limit length to 255 characters
45
+
41
46
  file_obj = io.BytesIO(response.content)
42
47
  return filename, file_obj
43
48
 
44
49
  # Handle base64 strings
45
- if isinstance(file, str) and ',' in file and ';base64,' in file:
50
+ if isinstance(file, str) and "," in file and ";base64," in file:
46
51
  try:
47
52
  # Split header and data
48
- header, base64_data = file.split(',', 1)
53
+ header, base64_data = file.split(",", 1)
49
54
  import base64
55
+
50
56
  file_bytes = base64.b64decode(base64_data)
51
57
  file_obj = io.BytesIO(file_bytes)
52
-
58
+
53
59
  # Try to determine format from header
54
- format = 'bin'
55
- mime_type = header.split(':')[-1].split(';')[0].lower()
56
-
60
+ format = "bin"
61
+ mime_type = header.split(":")[-1].split(";")[0].lower()
62
+
57
63
  # Map MIME types to file extensions
58
64
  mime_to_ext = {
59
- 'application/pdf': 'pdf',
60
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
61
- 'application/msword': 'doc',
62
- 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
63
- 'application/vnd.ms-powerpoint': 'ppt',
64
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
65
- 'application/vnd.ms-excel': 'xls',
66
- 'image/jpeg': 'jpg',
67
- 'image/png': 'png',
68
- 'image/jpg': 'jpg'
65
+ "application/pdf": "pdf",
66
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
67
+ "application/msword": "doc",
68
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
69
+ "application/vnd.ms-powerpoint": "ppt",
70
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
71
+ "application/vnd.ms-excel": "xls",
72
+ "image/jpeg": "jpg",
73
+ "image/png": "png",
74
+ "image/jpg": "jpg",
69
75
  }
70
-
76
+
71
77
  if mime_type in mime_to_ext:
72
78
  format = mime_to_ext[mime_type]
73
79
  else:
@@ -82,36 +88,37 @@ def prepare_file(
82
88
  path = Path(file).resolve()
83
89
  if not path.exists():
84
90
  raise FileNotFoundError(f"File not found: {file}")
85
- return path.name, open(path, 'rb')
91
+ return path.name, open(path, "rb")
86
92
 
87
93
  # Handle PIL Images
88
94
  if isinstance(file, Image.Image):
89
95
  img_byte_arr = io.BytesIO()
90
- format = file.format or 'PNG'
96
+ format = file.format or "PNG"
91
97
  file.save(img_byte_arr, format=format)
92
98
  img_byte_arr.seek(0)
93
99
  return f"image.{format.lower()}", img_byte_arr
94
100
 
95
101
  # Handle file-like objects
96
- if hasattr(file, 'read') and hasattr(file, 'seek'):
102
+ if hasattr(file, "read") and hasattr(file, "seek"):
97
103
  # Try to get the filename from the file object if possible
98
- name = getattr(file, 'name', 'document') if hasattr(file, 'name') else 'document'
104
+ name = (
105
+ getattr(file, "name", "document") if hasattr(file, "name") else "document"
106
+ )
99
107
  return Path(name).name, file
100
108
 
101
109
  raise TypeError(f"Unsupported file type: {type(file)}")
102
110
 
103
111
 
104
-
105
112
  def prepare_upload_data(
106
113
  file: Optional[Union[str, Path, BinaryIO, Image.Image]] = None,
107
- config: Optional[Configuration] = None
114
+ config: Optional[Configuration] = None,
108
115
  ) -> dict:
109
116
  """Prepare files and data dictionaries for upload.
110
-
117
+
111
118
  Args:
112
119
  file: The file to upload
113
120
  config: Optional configuration settings
114
-
121
+
115
122
  Returns:
116
123
  dict: (files dict) ready for upload
117
124
  """
@@ -123,6 +130,6 @@ def prepare_upload_data(
123
130
  if config:
124
131
  config_dict = config.model_dump(mode="json", exclude_none=True)
125
132
  for key, value in config_dict.items():
126
- files[key] = (None, json.dumps(value), 'application/json')
127
-
133
+ files[key] = (None, json.dumps(value), "application/json")
134
+
128
135
  return files
chunkr_ai/api/protocol.py CHANGED
@@ -1,14 +1,16 @@
1
- from typing import runtime_checkable, Protocol
1
+ from typing import Optional, runtime_checkable, Protocol
2
2
  from requests import Session
3
3
  from httpx import AsyncClient
4
4
 
5
+
5
6
  @runtime_checkable
6
7
  class ChunkrClientProtocol(Protocol):
7
8
  """Protocol defining the interface for Chunkr clients"""
9
+
8
10
  url: str
9
11
  _api_key: str
10
- _session: Session
11
- _client: AsyncClient
12
+ _session: Optional[Session] = None
13
+ _client: Optional[AsyncClient] = None
12
14
 
13
15
  def get_api_key(self) -> str:
14
16
  """Get the API key"""
@@ -16,4 +18,4 @@ class ChunkrClientProtocol(Protocol):
16
18
 
17
19
  def _headers(self) -> dict:
18
20
  """Return headers required for API requests"""
19
- ...
21
+ ...