chunkr-ai 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
File without changes
chunkr_ai/api/auth.py CHANGED
@@ -1,5 +1,3 @@
1
- from typing import Optional
2
-
3
1
  class HeadersMixin:
4
2
  """Mixin class for handling authorization headers"""
5
3
 
chunkr_ai/api/base.py ADDED
@@ -0,0 +1,173 @@
1
+ from .config import Configuration
2
+ from .task import TaskResponse
3
+ from .auth import HeadersMixin
4
+ from abc import abstractmethod
5
+ from dotenv import load_dotenv
6
+ import io
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from PIL import Image
11
+ import requests
12
+ from typing import BinaryIO, Tuple, Union
13
+
14
class ChunkrBase(HeadersMixin):
    """Base class with shared functionality for Chunkr API clients.

    Resolves the API URL and key and turns the supported input kinds into
    multipart upload payloads. Subclasses supply the HTTP transport by
    implementing ``upload``, ``start_upload`` and ``get_task``.
    """

    # Maps the MIME type found in a data-URL header to the extension used for
    # the synthetic upload filename.
    _MIME_TO_EXT = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/msword': 'doc',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        'application/vnd.ms-powerpoint': 'ppt',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
        'application/vnd.ms-excel': 'xls',
        'image/jpeg': 'jpg',
        'image/png': 'png',
        'image/jpg': 'jpg',
    }

    def __init__(self, url: str = None, api_key: str = None):
        """Resolve the API endpoint and key.

        Args:
            url: Base URL of the API. Falls back to the ``CHUNKR_URL``
                environment variable, then to ``https://api.chunkr.ai``.
            api_key: API key. Falls back to the ``CHUNKR_API_KEY``
                environment variable.

        Raises:
            ValueError: If no API key could be resolved.
        """
        # Load a local .env file (if any) before reading the environment.
        load_dotenv()
        self.url = (
            url or
            os.getenv('CHUNKR_URL') or
            'https://api.chunkr.ai'
        )
        self._api_key = (
            api_key or
            os.getenv('CHUNKR_API_KEY')
        )
        if not self._api_key:
            raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")

        # Endpoints are built as f"{self.url}/api/...", so drop trailing slashes.
        self.url = self.url.rstrip("/")

    def _prepare_file(
        self,
        file: Union[str, Path, BinaryIO, Image.Image]
    ) -> Tuple[str, BinaryIO]:
        """Convert various file types into a tuple of (filename, file-like object).

        Args:
            file: Input file, can be:
                - String or Path to a file
                - URL string starting with http:// or https://
                - Base64 data URL string (``data:<mime>;base64,<payload>``)
                - Opened binary file (mode='rb')
                - PIL/Pillow Image object

        Returns:
            Tuple[str, BinaryIO]: (filename, file-like object) ready for upload

        Raises:
            FileNotFoundError: If the file path doesn't exist
            TypeError: If the file type is not supported
            ValueError: If the base64 payload cannot be decoded
            ValueError: If the MIME type is unsupported
            requests.exceptions.RequestException: If downloading a URL fails
                or the server answers with an error status
        """
        # Handle URLs
        if isinstance(file, str) and file.startswith(('http://', 'https://')):
            from urllib.parse import urlparse

            response = requests.get(file)
            response.raise_for_status()
            file_obj = io.BytesIO(response.content)
            # Derive the name from the URL path only, so query strings and
            # fragments do not end up in the uploaded filename.
            filename = Path(urlparse(file).path).name or 'downloaded_file'
            return filename, file_obj

        # Handle base64 strings
        if isinstance(file, str) and ';base64,' in file:
            import base64

            # Split header and data
            header, base64_data = file.split(',', 1)
            try:
                file_bytes = base64.b64decode(base64_data)
            except ValueError as e:  # binascii.Error is a ValueError subclass
                raise ValueError(f"Invalid base64 string: {str(e)}") from e

            # Determine the extension from the MIME type in the header. This is
            # checked outside the decode try-block so an unsupported type is
            # not reported as an invalid base64 payload.
            mime_type = header.split(':')[-1].split(';')[0].lower()
            extension = self._MIME_TO_EXT.get(mime_type)
            if extension is None:
                raise ValueError(f"Unsupported MIME type: {mime_type}")

            return f"file.{extension}", io.BytesIO(file_bytes)

        # Handle file paths
        if isinstance(file, (str, Path)):
            path = Path(file).resolve()
            if not path.exists():
                raise FileNotFoundError(f"File not found: {file}")
            # The opened handle is returned to the caller, which owns closing it.
            return path.name, open(path, 'rb')

        # Handle PIL Images
        if isinstance(file, Image.Image):
            img_byte_arr = io.BytesIO()
            # Images created in memory have no format; default to PNG.
            image_format = file.format or 'PNG'
            file.save(img_byte_arr, format=image_format)
            img_byte_arr.seek(0)
            return f"image.{image_format.lower()}", img_byte_arr

        # Handle file-like objects
        if hasattr(file, 'read') and hasattr(file, 'seek'):
            # Use the object's name when it is a usable path; objects such as
            # BytesIO have none, and files opened from a descriptor expose an
            # int, which Path() cannot handle.
            name = getattr(file, 'name', None)
            if not isinstance(name, (str, os.PathLike)) or not name:
                name = 'document'
            return Path(name).name or 'document', file

        raise TypeError(f"Unsupported file type: {type(file)}")

    def _prepare_upload_data(
        self,
        file: Union[str, Path, BinaryIO, Image.Image],
        config: Configuration = None
    ) -> Tuple[dict, dict]:
        """Prepare files and data dictionaries for upload.

        Args:
            file: The file to upload
            config: Optional configuration settings

        Returns:
            Tuple[dict, dict]: (files dict, data dict) ready for upload
        """
        filename, file_obj = self._prepare_file(file)
        files = {"file": (filename, file_obj)}
        data = {}

        if config:
            config_dict = config.model_dump(mode="json", exclude_none=True)
            for key, value in config_dict.items():
                if isinstance(value, dict):
                    # Nested settings travel as their own JSON-encoded
                    # multipart part; scalars go in as plain form fields.
                    files[key] = (None, json.dumps(value), 'application/json')
                else:
                    data[key] = value

        return files, data

    @abstractmethod
    def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file and wait for processing to complete.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
        """Upload a file for processing and immediately return the task response.

        Must be implemented by subclasses.
        """
        pass

    @abstractmethod
    def get_task(self, task_id: str) -> TaskResponse:
        """Get a task response by its ID.

        Must be implemented by subclasses.
        """
        pass
chunkr_ai/api/chunkr.py CHANGED
@@ -1,125 +1,108 @@
1
- from .models import TaskResponse, Configuration
2
- from .auth import HeadersMixin
3
- from dotenv import load_dotenv
4
- import io
5
- import os
1
+ from .base import ChunkrBase
2
+ from .config import Configuration
3
+ from .task import TaskResponse
6
4
  from pathlib import Path
7
5
  from PIL import Image
8
6
  import requests
9
- from typing import Union, BinaryIO, Tuple
7
+ from typing import Union, BinaryIO
10
8
 
11
- class Chunkr(HeadersMixin):
12
- """Client for interacting with the Chunkr API."""
9
+ class Chunkr(ChunkrBase):
10
+ """Chunkr API client"""
13
11
 
14
12
  def __init__(self, url: str = None, api_key: str = None):
15
- load_dotenv()
16
- self.url = (
17
- url or
18
- os.getenv('CHUNKR_URL') or
19
- 'https://api.chunkr.ai'
20
- )
21
- self._api_key = (
22
- api_key or
23
- os.getenv('CHUNKR_API_KEY')
24
- )
25
- if not self._api_key:
26
- raise ValueError("API key must be provided either directly, in .env file, or as CHUNKR_API_KEY environment variable. You can get an api key at: https://www.chunkr.ai")
27
-
28
- self.url = self.url.rstrip("/")
13
+ super().__init__(url, api_key)
14
+ self._session = requests.Session()
29
15
 
30
- def _prepare_file(
31
- self,
32
- file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO]
33
- ) -> Tuple[str, BinaryIO]:
34
- """Convert various file types into a tuple of (filename, file-like object).
16
+ def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
17
+ """Upload a file and wait for processing to complete.
35
18
 
36
19
  Args:
37
- file: Input file in various formats
20
+ file: The file to upload.
21
+ config: Configuration options for processing. Optional.
38
22
 
39
- Returns:
40
- Tuple[str, BinaryIO]: Filename and file-like object ready for upload
41
- """
42
- if isinstance(file, str):
43
- path = Path(file).resolve()
44
- if not path.exists():
45
- raise FileNotFoundError(f"File not found: {file}")
46
- return path.name, path.open("rb")
47
- elif isinstance(file, Image.Image):
48
- img_byte_arr = io.BytesIO()
49
- file.save(img_byte_arr, format=file.format or 'PNG')
50
- img_byte_arr.seek(0)
51
- return "image.png", img_byte_arr
52
- elif isinstance(file, bytes):
53
- return "document", io.BytesIO(file)
54
- elif isinstance(file, io.BytesIO):
55
- return "document", file
56
- else:
57
- return "document", file
58
-
59
- def upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
60
- """Upload a file and wait for processing to complete.
23
+ Examples:
24
+ ```
25
+ # Upload from file path
26
+ chunkr.upload("document.pdf")
61
27
 
62
- The file can be one of:
63
- - str: Path to a file on disk
64
- - BinaryIO: A file-like object (e.g., opened with 'rb' mode)
65
- - Image.Image: A PIL/Pillow Image object
66
- - bytes: Raw binary data
67
- - io.BytesIO: A binary stream in memory
28
+ # Upload from URL
29
+ chunkr.upload("https://example.com/document.pdf")
68
30
 
69
- Args:
70
- file: The file to upload.
71
- config:
72
- Configuration options for processing. Optional.
31
+ # Upload from base64 string (must include MIME type header)
32
+ chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
73
33
 
34
+ # Upload from opened file
35
+ with open("document.pdf", "rb") as f:
36
+ chunkr.upload(f)
37
+
38
+ # Upload an image
39
+ from PIL import Image
40
+ img = Image.open("photo.jpg")
41
+ chunkr.upload(img)
42
+ ```
74
43
  Returns:
75
44
  TaskResponse: The completed task response
76
45
  """
77
- return self.start_upload(file, config).poll()
46
+ task = self.start_upload(file, config)
47
+ return task.poll()
78
48
 
79
- def start_upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
49
+ def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
80
50
  """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll()`
81
51
 
82
- The file can be one of:
83
- - str: Path to a file on disk
84
- - BinaryIO: A file-like object (e.g., opened with 'rb' mode)
85
- - Image.Image: A PIL/Pillow Image object
86
- - bytes: Raw binary data
87
- - io.BytesIO: A binary stream in memory
88
-
89
52
  Args:
90
53
  file: The file to upload.
91
- config (Configuration, optional): Configuration options for processing
54
+ config: Configuration options for processing. Optional.
55
+
56
+ Examples:
57
+ ```
58
+ # Upload from file path
59
+ task = chunkr.start_upload("document.pdf")
60
+
61
+ # Upload from opened file
62
+ with open("document.pdf", "rb") as f:
63
+ task = chunkr.start_upload(f)
64
+
65
+ # Upload from URL
66
+ task = chunkr.start_upload("https://example.com/document.pdf")
67
+
68
+ # Upload from base64 string (must include MIME type header)
69
+ task = chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
70
+
71
+ # Upload an image
72
+ from PIL import Image
73
+ img = Image.open("photo.jpg")
74
+ task = chunkr.start_upload(img)
75
+
76
+ # Wait for the task to complete - this can be done when needed
77
+ task.poll()
78
+ ```
92
79
 
93
80
  Returns:
94
81
  TaskResponse: The initial task response
95
-
96
- Raises:
97
- requests.exceptions.HTTPError: If the API request fails
98
82
  """
99
- url = f"{self.url}/api/v1/task"
100
- filename, file_obj = self._prepare_file(file)
101
-
102
- files = {"file": (filename, file_obj)}
103
- r = requests.post(
104
- url,
105
- files=files,
106
- json=config.dict() if config else {},
83
+ files, data = self._prepare_upload_data(file, config)
84
+ r = self._session.post(
85
+ f"{self.url}/api/v1/task",
86
+ files=files,
87
+ data=data,
107
88
  headers=self._headers()
108
89
  )
109
90
  r.raise_for_status()
110
- return TaskResponse(**r.json()).with_api_key(self._api_key)
91
+ return TaskResponse(**r.json()).with_client(self)
111
92
 
112
93
  def get_task(self, task_id: str) -> TaskResponse:
113
94
  """Get a task response by its ID.
114
95
 
115
96
  Args:
116
- task_id (str): The ID of the task to get
97
+ task_id: The ID of the task to get
117
98
 
118
99
  Returns:
119
100
  TaskResponse: The task response
120
101
  """
121
- url = f"{self.url}/api/v1/task/{task_id}"
122
- r = requests.get(url, headers=self._headers())
102
+ r = self._session.get(
103
+ f"{self.url}/api/v1/task/{task_id}",
104
+ headers=self._headers()
105
+ )
123
106
  r.raise_for_status()
124
- return TaskResponse(**r.json()).with_api_key(self._api_key)
107
+ return TaskResponse(**r.json()).with_client(self)
125
108
 
@@ -1,39 +1,105 @@
1
- from .chunkr import Chunkr
2
- from .models import TaskResponse, Configuration
1
+ from .base import ChunkrBase
2
+ from .task import TaskResponse
3
+ from .config import Configuration
3
4
  import httpx
4
- import io
5
+ from pathlib import Path
5
6
  from PIL import Image
6
7
  from typing import Union, BinaryIO
7
8
 
8
- class ChunkrAsync(Chunkr):
9
- """Async client for interacting with the Chunkr API.
9
+ class ChunkrAsync(ChunkrBase):
10
+ """Asynchronous Chunkr API client"""
10
11
 
11
- This class inherits from the Chunkr class but works with async HTTP requests.
12
- """
12
+ def __init__(self, url: str = None, api_key: str = None):
13
+ super().__init__(url, api_key)
14
+ self._client = httpx.AsyncClient()
13
15
 
14
- async def upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
16
+ async def upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
17
+ """Upload a file and wait for processing to complete.
18
+
19
+ Args:
20
+ file: The file to upload.
21
+ config: Configuration options for processing. Optional.
22
+
23
+ Examples:
24
+ ```python
25
+ # Upload from file path
26
+ await chunkr.upload("document.pdf")
27
+
28
+ # Upload from opened file
29
+ with open("document.pdf", "rb") as f:
30
+ await chunkr.upload(f)
31
+
32
+ # Upload from URL
33
+ await chunkr.upload("https://example.com/document.pdf")
34
+
35
+ # Upload from base64 string (must include MIME type header)
36
+ await chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
37
+
38
+ # Upload an image
39
+ from PIL import Image
40
+ img = Image.open("photo.jpg")
41
+ await chunkr.upload(img)
42
+ ```
43
+ Returns:
44
+ TaskResponse: The completed task response
45
+ """
15
46
  task = await self.start_upload(file, config)
16
47
  return await task.poll_async()
17
48
 
18
- async def start_upload(self, file: Union[str, BinaryIO, Image.Image, bytes, io.BytesIO], config: Configuration = None) -> TaskResponse:
19
- url = f"{self.url}/api/v1/task"
20
- filename, file_obj = self._prepare_file(file)
21
- async with httpx.AsyncClient() as client:
22
- files = {"file": (filename, file_obj)}
23
- r = await client.post(
24
- url,
25
- files=files,
26
- json=config.dict() if config else {},
27
- headers=self._headers()
28
- )
29
- r.raise_for_status()
30
- return TaskResponse(**r.json()).with_api_key(self._api_key)
49
async def start_upload(self, file: Union[str, Path, BinaryIO, Image.Image], config: Configuration = None) -> TaskResponse:
    """Upload a file for processing and immediately return the task response. It will not wait for processing to complete. To wait for the full processing to complete, use `task.poll_async()`.

    Args:
        file: The file to upload.
        config: Configuration options for processing. Optional.

    Examples:
    ```
    # Upload from file path
    task = await chunkr.start_upload("document.pdf")

    # Upload from opened file
    with open("document.pdf", "rb") as f:
        task = await chunkr.start_upload(f)

    # Upload from URL
    task = await chunkr.start_upload("https://example.com/document.pdf")

    # Upload from base64 string (must include MIME type header)
    task = await chunkr.start_upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")

    # Upload an image
    from PIL import Image
    img = Image.open("photo.jpg")
    task = await chunkr.start_upload(img)

    # Wait for the task to complete - this can be done when needed
    await task.poll_async()
    ```

    Returns:
        TaskResponse: The initial task response
    """
    files, data = self._prepare_upload_data(file, config)
    r = await self._client.post(
        f"{self.url}/api/v1/task",
        files=files,
        # Send the scalar configuration fields as multipart form fields.
        # A `json=` body is ignored by httpx whenever `files=` is supplied,
        # which would silently drop the configuration.
        data=data,
        headers=self._headers()
    )
    r.raise_for_status()
    return TaskResponse(**r.json()).with_client(self)
31
92
 
32
93
  async def get_task(self, task_id: str) -> TaskResponse:
33
- url = f"{self.url}/api/v1/task/{task_id}"
34
- async with httpx.AsyncClient() as client:
35
- r = await client.get(url, headers=self._headers())
36
- r.raise_for_status()
37
- return TaskResponse(**r.json()).with_api_key(self._api_key)
94
+ r = await self._client.get(
95
+ f"{self.url}/api/v1/task/{task_id}",
96
+ headers=self._headers()
97
+ )
98
+ r.raise_for_status()
99
+ return TaskResponse(**r.json()).with_client(self)
100
+
101
+ async def __aenter__(self):
102
+ return self
38
103
 
39
-
104
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
105
+ await self._client.aclose()
@@ -0,0 +1,131 @@
1
+ from pydantic import BaseModel, Field, model_validator
2
+ from enum import Enum
3
+ from typing import Optional, List, Dict
4
+
5
+ class GenerationStrategy(str, Enum):
6
+ LLM = "LLM"
7
+ AUTO = "Auto"
8
+
9
+ class CroppingStrategy(str, Enum):
10
+ ALL = "All"
11
+ AUTO = "Auto"
12
+
13
+ class LlmConfig(BaseModel):
14
+ model: str
15
+ prompt: str
16
+ temperature: float = 0.0
17
+
18
+ class GenerationConfig(BaseModel):
19
+ html: Optional[GenerationStrategy] = None
20
+ llm: Optional[LlmConfig] = None
21
+ markdown: Optional[GenerationStrategy] = None
22
+ crop_image: Optional[CroppingStrategy] = None
23
+
24
+ class SegmentProcessing(BaseModel):
25
+ title: Optional[GenerationConfig] = None
26
+ section_header: Optional[GenerationConfig] = None
27
+ text: Optional[GenerationConfig] = None
28
+ list_item: Optional[GenerationConfig] = None
29
+ table: Optional[GenerationConfig] = None
30
+ picture: Optional[GenerationConfig] = None
31
+ caption: Optional[GenerationConfig] = None
32
+ formula: Optional[GenerationConfig] = None
33
+ footnote: Optional[GenerationConfig] = None
34
+ page_header: Optional[GenerationConfig] = None
35
+ page_footer: Optional[GenerationConfig] = None
36
+ page: Optional[GenerationConfig] = None
37
+
38
+ class ChunkProcessing(BaseModel):
39
+ target_length: Optional[int] = None
40
+
41
+ class Property(BaseModel):
42
+ name: str
43
+ title: Optional[str] = None
44
+ prop_type: str
45
+ description: Optional[str] = None
46
+ default: Optional[str] = None
47
+
48
+ class JsonSchema(BaseModel):
49
+ title: str
50
+ properties: List[Property]
51
+
52
+ class OcrStrategy(str, Enum):
53
+ ALL = "All"
54
+ AUTO = "Auto"
55
+
56
+ class SegmentationStrategy(str, Enum):
57
+ LAYOUT_ANALYSIS = "LayoutAnalysis"
58
+ PAGE = "Page"
59
+
60
+ class BoundingBox(BaseModel):
61
+ left: float
62
+ top: float
63
+ width: float
64
+ height: float
65
+
66
+ class OCRResult(BaseModel):
67
+ bbox: BoundingBox
68
+ text: str
69
+ confidence: Optional[float]
70
+
71
+ class SegmentType(str, Enum):
72
+ CAPTION = "Caption"
73
+ FOOTNOTE = "Footnote"
74
+ FORMULA = "Formula"
75
+ LIST_ITEM = "ListItem"
76
+ PAGE = "Page"
77
+ PAGE_FOOTER = "PageFooter"
78
+ PAGE_HEADER = "PageHeader"
79
+ PICTURE = "Picture"
80
+ SECTION_HEADER = "SectionHeader"
81
+ TABLE = "Table"
82
+ TEXT = "Text"
83
+ TITLE = "Title"
84
+
85
+ class Segment(BaseModel):
86
+ bbox: BoundingBox
87
+ content: str
88
+ page_height: float
89
+ html: Optional[str]
90
+ image: Optional[str]
91
+ markdown: Optional[str]
92
+ ocr: List[OCRResult]
93
+ page_number: int
94
+ page_width: float
95
+ segment_id: str
96
+ segment_type: SegmentType
97
+
98
+ class Chunk(BaseModel):
99
+ chunk_id: str
100
+ chunk_length: int
101
+ segments: List[Segment]
102
+
103
+ class ExtractedJson(BaseModel):
104
+ data: Dict
105
+
106
+ class OutputResponse(BaseModel):
107
+ chunks: List[Chunk] = []
108
+ extracted_json: Optional[ExtractedJson]
109
+
110
+ class Model(str, Enum):
111
+ FAST = "Fast"
112
+ HIGH_QUALITY = "HighQuality"
113
+
114
class Configuration(BaseModel):
    """Processing options sent with an upload; every field is optional.

    The deprecated ``target_chunk_length`` keyword is still accepted and is
    mapped onto ``chunk_processing.target_length``.
    """
    chunk_processing: Optional[ChunkProcessing] = Field(default=None)
    expires_in: Optional[int] = Field(default=None)
    high_resolution: Optional[bool] = Field(default=None)
    json_schema: Optional[JsonSchema] = Field(default=None)
    model: Optional[Model] = Field(default=None)
    ocr_strategy: Optional[OcrStrategy] = Field(default=None)
    segment_processing: Optional[SegmentProcessing] = Field(default=None)
    segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)

    @model_validator(mode='before')
    @classmethod
    def map_deprecated_fields(cls, values: Dict) -> Dict:
        """Translate ``target_chunk_length`` into ``chunk_processing.target_length``.

        Args:
            values: Raw input passed to the model before field validation.

        Returns:
            The input unchanged when the deprecated key is absent or the input
            is not a dict; otherwise a new dict without the deprecated key.
        """
        if not isinstance(values, dict) or "target_chunk_length" not in values:
            return values

        # Work on a shallow copy so the caller's dict is left untouched.
        values = dict(values)
        target_length = values.pop("target_chunk_length")
        if target_length is not None:
            existing = values.get("chunk_processing")
            if isinstance(existing, ChunkProcessing):
                # An already-built model does not support item assignment.
                values["chunk_processing"] = existing.model_copy(
                    update={"target_length": target_length}
                )
            else:
                # Copy the nested dict too; a None/missing value starts empty.
                values["chunk_processing"] = {
                    **(existing or {}),
                    "target_length": target_length,
                }
        return values
@@ -0,0 +1,19 @@
1
+ from typing import runtime_checkable, Protocol
2
+ from requests import Session
3
+ from httpx import AsyncClient
4
+
5
+ @runtime_checkable
6
+ class ChunkrClientProtocol(Protocol):
7
+ """Protocol defining the interface for Chunkr clients"""
8
+ url: str
9
+ _api_key: str
10
+ _session: Session
11
+ _client: AsyncClient
12
+
13
+ def get_api_key(self) -> str:
14
+ """Get the API key"""
15
+ ...
16
+
17
+ def _headers(self) -> dict:
18
+ """Return headers required for API requests"""
19
+ ...
chunkr_ai/api/task.py ADDED
@@ -0,0 +1,131 @@
1
+ from .protocol import ChunkrClientProtocol
2
+ from .config import Configuration, OutputResponse
3
+ import asyncio
4
+ from datetime import datetime
5
+ from enum import Enum
6
+ from pydantic import BaseModel, PrivateAttr
7
+ import time
8
+ from typing import Optional, Union
9
+
10
+ class Status(str, Enum):
11
+ STARTING = "Starting"
12
+ PROCESSING = "Processing"
13
+ SUCCEEDED = "Succeeded"
14
+ FAILED = "Failed"
15
+
16
class TaskResponse(BaseModel):
    """State of a processing task, with helpers to poll it and read its output.

    The client that created the response is attached with ``with_client`` and
    is used by ``poll``/``poll_async`` to re-fetch ``task_url``.
    """
    configuration: Configuration
    created_at: datetime
    expires_at: Optional[datetime]
    file_name: Optional[str]
    finished_at: Optional[datetime]
    input_file_url: Optional[str]
    message: str
    output: Optional[OutputResponse]
    page_count: Optional[int]
    pdf_url: Optional[str]
    status: Status
    task_id: str
    task_url: Optional[str]
    # Private attribute: not part of the validated/serialized fields.
    _client: Optional[ChunkrClientProtocol] = PrivateAttr(default=None)

    def with_client(self, client: ChunkrClientProtocol) -> 'TaskResponse':
        """Attach the client used for subsequent polling and return ``self``."""
        self._client = client
        return self

    def _require_client(self) -> ChunkrClientProtocol:
        """Return the attached client, failing clearly when there is none."""
        if self._client is None:
            raise ValueError("No client attached to this task; call with_client() before polling")
        return self._client

    def _apply_response(self, response: dict) -> None:
        """Replace this task's fields with a validated copy of ``response``.

        Validating first keeps ``status`` a ``Status`` member and ``output`` an
        ``OutputResponse`` instead of the raw JSON values, which the content
        helpers rely on. The attached client is stored privately and is kept.
        """
        refreshed = type(self).model_validate(response)
        self.__dict__.update(refreshed.__dict__)

    def _poll_request_sync(self) -> dict:
        """Helper method to make polling request with retry logic (synchronous)"""
        if not self.task_url:
            raise ValueError("Task URL not found in response")
        client = self._require_client()

        while True:
            try:
                r = client._session.get(self.task_url, headers=client._headers())
                r.raise_for_status()
                return r.json()
            # NOTE(review): these are the builtin exception classes; confirm
            # the HTTP library's own connection/timeout errors derive from
            # them, otherwise this retry branch never runs.
            except (ConnectionError, TimeoutError):
                print("Connection error while polling the task, retrying...")
                time.sleep(0.5)

    async def _poll_request_async(self) -> dict:
        """Helper method to make polling request with retry logic (asynchronous)"""
        if not self.task_url:
            raise ValueError("Task URL not found in response")
        client = self._require_client()

        while True:
            try:
                r = await client._client.get(self.task_url, headers=client._headers())
                # Only the request itself is awaitable; raise_for_status() and
                # json() are plain synchronous methods on the response.
                r.raise_for_status()
                return r.json()
            # NOTE(review): builtin exception classes, see _poll_request_sync.
            except (ConnectionError, TimeoutError):
                print("Connection error while polling the task, retrying...")
                await asyncio.sleep(0.5)

    def _check_status(self) -> Optional['TaskResponse']:
        """Helper method to check task status and handle completion/failure

        Returns:
            ``self`` once the task is no longer starting or processing,
            otherwise ``None``.

        Raises:
            ValueError: If the task failed; carries the task's ``message``.
        """
        if self.status == Status.FAILED:
            raise ValueError(self.message)
        if self.status not in (Status.STARTING, Status.PROCESSING):
            return self
        return None

    def poll(self) -> 'TaskResponse':
        """Poll the task for completion."""
        while True:
            self._apply_response(self._poll_request_sync())

            if result := self._check_status():
                return result

            time.sleep(0.5)

    async def poll_async(self) -> 'TaskResponse':
        """Poll the task for completion asynchronously."""
        while True:
            self._apply_response(await self._poll_request_async())

            if result := self._check_status():
                return result

            await asyncio.sleep(0.5)

    def _get_content(self, content_type: str) -> str:
        """Helper method to get either HTML, Markdown, or raw content.

        Joins the non-empty ``content_type`` attribute of every segment of
        every chunk with newlines; returns ``""`` when there is no output.
        """
        if not self.output:
            return ""
        parts = []
        for c in self.output.chunks:
            for s in c.segments:
                content = getattr(s, content_type)
                if content:
                    parts.append(content)
        return "\n".join(parts)

    def html(self) -> str:
        """Get full HTML for the task"""
        return self._get_content("html")

    def markdown(self) -> str:
        """Get full markdown for the task"""
        return self._get_content("markdown")

    def content(self) -> str:
        """Get full text for the task"""
        return self._get_content("content")
121
+
122
+ class TaskPayload(BaseModel):
123
+ current_configuration: Configuration
124
+ file_name: str
125
+ image_folder_location: str
126
+ input_location: str
127
+ output_location: str
128
+ pdf_location: str
129
+ previous_configuration: Optional[Configuration]
130
+ task_id: str
131
+ user_id: str
chunkr_ai/models.py ADDED
@@ -0,0 +1,48 @@
1
+ from .api.config import (
2
+ BoundingBox,
3
+ Chunk,
4
+ ChunkProcessing,
5
+ Configuration,
6
+ CroppingStrategy,
7
+ ExtractedJson,
8
+ GenerationStrategy,
9
+ GenerationConfig,
10
+ JsonSchema,
11
+ LlmConfig,
12
+ Model,
13
+ OCRResult,
14
+ OcrStrategy,
15
+ OutputResponse,
16
+ Property,
17
+ Segment,
18
+ SegmentProcessing,
19
+ SegmentType,
20
+ SegmentationStrategy,
21
+ )
22
+
23
+ from .api.task import TaskResponse, TaskPayload, Status
24
+
25
+ __all__ = [
26
+ 'BoundingBox',
27
+ 'Chunk',
28
+ 'ChunkProcessing',
29
+ 'Configuration',
30
+ 'CroppingStrategy',
31
+ 'ExtractedJson',
32
+ 'GenerationConfig',
33
+ 'GenerationStrategy',
34
+ 'JsonSchema',
35
+ 'LlmConfig',
36
+ 'Model',
37
+ 'OCRResult',
38
+ 'OcrStrategy',
39
+ 'OutputResponse',
40
+ 'Property',
41
+ 'Segment',
42
+ 'SegmentProcessing',
43
+ 'SegmentType',
44
+ 'SegmentationStrategy',
45
+ 'Status',
46
+ 'TaskPayload',
47
+ 'TaskResponse'
48
+ ]
@@ -0,0 +1,204 @@
1
+ Metadata-Version: 2.2
2
+ Name: chunkr-ai
3
+ Version: 0.0.4
4
+ Summary: Python client for Chunkr: open source document intelligence
5
+ Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
+ Project-URL: Homepage, https://chunkr.ai
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: httpx>=0.28.1
10
+ Requires-Dist: pillow>=11.1.0
11
+ Requires-Dist: pydantic>=2.10.4
12
+ Requires-Dist: python-dotenv>=1.0.1
13
+ Requires-Dist: requests>=2.32.3
14
+ Provides-Extra: test
15
+ Requires-Dist: pytest>=8.3.4; extra == "test"
16
+ Requires-Dist: pytest-xdist>=3.6.1; extra == "test"
17
+
18
+ # Chunkr Python Client
19
+
20
+ This provides a simple interface to interact with the Chunkr API.
21
+
22
+ ## Getting Started
23
+
24
+ You can get an API key from [Chunkr](https://chunkr.ai) or deploy your own Chunkr instance. For self-hosted deployment options, check out our [deployment guide](https://github.com/lumina-ai-inc/chunkr/tree/main?tab=readme-ov-file#self-hosted-deployment-options).
25
+
26
+ For more information about the API and its capabilities, visit the [Chunkr API docs](https://docs.chunkr.ai).
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ pip install chunkr-ai
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ We provide two clients: `Chunkr` for synchronous operations and `ChunkrAsync` for asynchronous operations.
37
+
38
+ ### Synchronous Usage
39
+
40
+ ```python
41
+ from chunkr_ai import Chunkr
42
+
43
+ # Initialize client
44
+ chunkr = Chunkr()
45
+
46
+ # Upload a file and wait for processing
47
+ task = chunkr.upload("document.pdf")
48
+
49
+ # Print the response
50
+ print(task)
51
+
52
+ # Get output from task
53
+ output = task.output
54
+
55
+ # If you want to upload without waiting for processing
56
+ task = chunkr.start_upload("document.pdf")
57
+ # ... do other things ...
58
+ task.poll() # Check status when needed
59
+ ```
60
+
61
+ ### Asynchronous Usage
62
+
63
+ ```python
64
+ from chunkr_ai import ChunkrAsync
65
+
66
+ async def process_document():
67
+ # Initialize client
68
+ chunkr = ChunkrAsync()
69
+
70
+ # Upload a file and wait for processing
71
+ task = await chunkr.upload("document.pdf")
72
+
73
+ # Print the response
74
+ print(task)
75
+
76
+ # Get output from task
77
+ output = task.output
78
+
79
+ # If you want to upload without waiting for processing
80
+ task = await chunkr.start_upload("document.pdf")
81
+ # ... do other things ...
82
+ await task.poll_async() # Check status when needed
83
+ ```
84
+
85
+ ### Additional Features
86
+
87
+ Both clients support various input types:
88
+
89
+ ```python
90
+ # Upload from file path
91
+ chunkr.upload("document.pdf")
92
+
93
+ # Upload from opened file
94
+ with open("document.pdf", "rb") as f:
95
+ chunkr.upload(f)
96
+
97
+ # Upload from URL
98
+ chunkr.upload("https://example.com/document.pdf")
99
+
100
+ # Upload from base64 string
101
+ chunkr.upload("data:application/pdf;base64,JVBERi0xLjcKCjEgMCBvYmo...")
102
+
103
+ # Upload an image
104
+ from PIL import Image
105
+ img = Image.open("photo.jpg")
106
+ chunkr.upload(img)
107
+ ```
108
+
109
+ ### Configuration
110
+
111
+ You can customize the processing behavior by passing a `Configuration` object:
112
+
113
+ ```python
114
+ from chunkr_ai.models import Configuration, OcrStrategy, SegmentationStrategy, GenerationStrategy
115
+
116
+ # Basic configuration
117
+ config = Configuration(
118
+ ocr_strategy=OcrStrategy.AUTO,
119
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS,
120
+ high_resolution=True,
121
+ expires_in=3600, # seconds
122
+ )
123
+
124
+ # Upload with configuration
125
+ task = chunkr.upload("document.pdf", config)
126
+ ```
127
+
128
+ #### Available Configuration Examples
129
+
130
+ - **Chunk Processing**
131
+ ```python
132
+ from chunkr_ai.models import ChunkProcessing
133
+ config = Configuration(
134
+ chunk_processing=ChunkProcessing(target_length=1024)
135
+ )
136
+ ```
137
+ - **Expires In**
138
+ ```python
139
+ config = Configuration(expires_in=3600)
140
+ ```
141
+
142
+ - **High Resolution**
143
+ ```python
144
+ config = Configuration(high_resolution=True)
145
+ ```
146
+
147
+ - **JSON Schema**
148
+ ```python
149
+ from chunkr_ai.models import Configuration, JsonSchema, Property
+ config = Configuration(json_schema=JsonSchema(
150
+ title="Sales Data",
151
+ properties=[
152
+ Property(name="Person with highest sales", prop_type="string", description="The person with the highest sales"),
153
+ Property(name="Person with lowest sales", prop_type="string", description="The person with the lowest sales"),
154
+ ]
155
+ ))
156
+ ```
157
+
158
+ - **OCR Strategy**
159
+ ```python
160
+ config = Configuration(ocr_strategy=OcrStrategy.AUTO)
161
+ ```
162
+
163
+ - **Segment Processing**
164
+ ```python
165
+ from chunkr_ai.models import SegmentProcessing, GenerationConfig, GenerationStrategy
166
+ config = Configuration(
167
+ segment_processing=SegmentProcessing(
168
+ page=GenerationConfig(
169
+ html=GenerationStrategy.LLM,
170
+ markdown=GenerationStrategy.LLM
171
+ )
172
+ )
173
+ )
174
+ ```
175
+
176
+ - **Segmentation Strategy**
177
+ ```python
178
+ config = Configuration(
179
+ segmentation_strategy=SegmentationStrategy.LAYOUT_ANALYSIS # or SegmentationStrategy.PAGE
180
+ )
181
+ ```
182
+
183
+ ## Environment setup
184
+
185
+ You can provide your API key and URL in several ways:
186
+ 1. Environment variables: `CHUNKR_API_KEY` and `CHUNKR_URL`
187
+ 2. `.env` file
188
+ 3. Direct initialization:
189
+ ```python
190
+ chunkr = Chunkr(
191
+ api_key="your-api-key",
192
+ url="https://api.chunkr.ai"
193
+ )
194
+ ```
195
+
196
+ ## Run tests
197
+
198
+ ```bash
199
+ # Install dependencies
200
+ uv pip install -e ".[test]"
201
+
202
+ # Run tests
203
+ uv run pytest
204
+ ```
@@ -0,0 +1,17 @@
1
+ chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
2
+ chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ chunkr_ai/models.py,sha256=d-B4vfgZClJOoHdPaH3vagwUc4qxeQSmUxab77DKYtQ,874
4
+ chunkr_ai/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ chunkr_ai/api/auth.py,sha256=iSd5Jek2BFaHGw9HY-RrqgwP56BHFU0xbSuJS4fU6AA,425
7
+ chunkr_ai/api/base.py,sha256=WDHx8tU0fl9_-yvYTKL-U0uaxHv-8_bRfiw9Xkl-mWM,6499
8
+ chunkr_ai/api/chunkr.py,sha256=LkBFzGB_T0y3fnBeIn_nwQW6Mb7eZO-iTlzWrmWBoko,3450
9
+ chunkr_ai/api/chunkr_async.py,sha256=B9deRVoe4h3Csh_jEuQxuxQ-DKSuZPdwkanFTyfHmeM,3603
10
+ chunkr_ai/api/config.py,sha256=K0s1giImciPksu-bO9gzRwUaK2Vo1nxNKQkXlRQ2cb8,3785
11
+ chunkr_ai/api/protocol.py,sha256=XKS9RmtvBpJItYhPg18qlOCKpaSHdOuQTRSUxAdUz2g,479
12
+ chunkr_ai/api/task.py,sha256=ALU-rYlObbitlM1MKEFeSz_IBUpzb9736Iqu9huWg7c,4392
13
+ chunkr_ai-0.0.4.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ chunkr_ai-0.0.4.dist-info/METADATA,sha256=7k2zij-F7_Kcs6nFCJMKQW382gFpOOLAnZoOOXFrKFs,4913
15
+ chunkr_ai-0.0.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
16
+ chunkr_ai-0.0.4.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
17
+ chunkr_ai-0.0.4.dist-info/RECORD,,
chunkr_ai/api/models.py DELETED
@@ -1,231 +0,0 @@
1
- from .auth import HeadersMixin
2
- import asyncio
3
- from datetime import datetime
4
- from enum import Enum
5
- import httpx
6
- from pydantic import BaseModel, Field, PrivateAttr
7
- import requests
8
- import time
9
- from typing import Optional, List, Dict, Union
10
-
11
- class GenerationStrategy(str, Enum):
12
- LLM = "LLM"
13
- AUTO = "Auto"
14
-
15
- class CroppingStrategy(str, Enum):
16
- ALL = "All"
17
- AUTO = "Auto"
18
-
19
- class LlmConfig(BaseModel):
20
- model: str
21
- prompt: str
22
- temperature: float = 0.0
23
-
24
- class AutoGenerationConfig(BaseModel):
25
- html: GenerationStrategy = GenerationStrategy.AUTO
26
- llm: Optional[LlmConfig] = None
27
- markdown: GenerationStrategy = GenerationStrategy.AUTO
28
- crop_image: CroppingStrategy = CroppingStrategy.ALL
29
-
30
- class LlmGenerationConfig(BaseModel):
31
- html: GenerationStrategy = GenerationStrategy.LLM
32
- llm: Optional[LlmConfig] = None
33
- markdown: GenerationStrategy = GenerationStrategy.LLM
34
- crop_image: CroppingStrategy = CroppingStrategy.ALL
35
-
36
- class SegmentProcessing(BaseModel):
37
- title: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
38
- section_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
39
- text: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
40
- list_item: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
41
- table: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
42
- picture: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
43
- caption: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
44
- formula: LlmGenerationConfig = Field(default_factory=LlmGenerationConfig)
45
- footnote: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
46
- page_header: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
47
- page_footer: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
48
- page: AutoGenerationConfig = Field(default_factory=AutoGenerationConfig)
49
-
50
- class ChunkProcessing(BaseModel):
51
- target_length: int = 512
52
-
53
- class Property(BaseModel):
54
- name: str
55
- title: Optional[str]
56
- prop_type: str
57
- description: Optional[str]
58
- default: Optional[str]
59
-
60
- class JsonSchema(BaseModel):
61
- title: str
62
- properties: List[Property]
63
- schema_type: Optional[str]
64
-
65
- class OcrStrategy(str, Enum):
66
- ALL = "All"
67
- AUTO = "Auto"
68
-
69
- class SegmentationStrategy(str, Enum):
70
- LAYOUT_ANALYSIS = "LayoutAnalysis"
71
- PAGE = "Page"
72
-
73
- class BoundingBox(BaseModel):
74
- left: float
75
- top: float
76
- width: float
77
- height: float
78
-
79
- class OCRResult(BaseModel):
80
- bbox: BoundingBox
81
- text: str
82
- confidence: Optional[float]
83
-
84
- class SegmentType(str, Enum):
85
- CAPTION = "Caption"
86
- FOOTNOTE = "Footnote"
87
- FORMULA = "Formula"
88
- LIST_ITEM = "ListItem"
89
- PAGE = "Page"
90
- PAGE_FOOTER = "PageFooter"
91
- PAGE_HEADER = "PageHeader"
92
- PICTURE = "Picture"
93
- SECTION_HEADER = "SectionHeader"
94
- TABLE = "Table"
95
- TEXT = "Text"
96
- TITLE = "Title"
97
-
98
- class Segment(BaseModel):
99
- bbox: BoundingBox
100
- content: str
101
- page_height: float
102
- html: Optional[str]
103
- image: Optional[str]
104
- markdown: Optional[str]
105
- ocr: List[OCRResult]
106
- page_number: int
107
- page_width: float
108
- segment_id: str
109
- segment_type: SegmentType
110
-
111
- class Chunk(BaseModel):
112
- chunk_id: str
113
- chunk_length: int
114
- segments: List[Segment]
115
-
116
- class ExtractedJson(BaseModel):
117
- data: Dict
118
-
119
- class OutputResponse(BaseModel):
120
- chunks: List[Chunk] = []
121
- extracted_json: Optional[ExtractedJson]
122
-
123
- class Model(str, Enum):
124
- FAST = "Fast"
125
- HIGH_QUALITY = "HighQuality"
126
-
127
- class Configuration(BaseModel):
128
- chunk_processing: ChunkProcessing = Field(default_factory=ChunkProcessing)
129
- expires_in: Optional[int] = None
130
- high_resolution: bool = False
131
- json_schema: Optional[JsonSchema] = None
132
- model: Optional[Model] = Field(None, deprecated=True)
133
- ocr_strategy: OcrStrategy = OcrStrategy.AUTO
134
- segment_processing: SegmentProcessing = Field(default_factory=SegmentProcessing)
135
- segmentation_strategy: SegmentationStrategy = SegmentationStrategy.LAYOUT_ANALYSIS
136
- target_chunk_length: Optional[int] = Field(None, deprecated=True)
137
-
138
-
139
- class Status(str, Enum):
140
- STARTING = "Starting"
141
- PROCESSING = "Processing"
142
- SUCCEEDED = "Succeeded"
143
- FAILED = "Failed"
144
-
145
- class TaskResponse(BaseModel, HeadersMixin):
146
- configuration: Configuration
147
- created_at: datetime
148
- expires_at: Optional[datetime]
149
- file_name: Optional[str]
150
- finished_at: Optional[datetime]
151
- input_file_url: Optional[str]
152
- message: str
153
- output: Optional[OutputResponse]
154
- page_count: Optional[int]
155
- pdf_url: Optional[str]
156
- status: Status
157
- task_id: str
158
- task_url: Optional[str]
159
- _api_key: Optional[str] = PrivateAttr(default=None)
160
-
161
- def with_api_key(self, api_key: str) -> 'TaskResponse':
162
- """Helper function to set api key on a TaskResponse after creation"""
163
- self._api_key = api_key
164
- return self
165
-
166
- def poll(self) -> 'TaskResponse':
167
- """Poll the task for completion"""
168
- if not self.task_url:
169
- raise ValueError("Task URL not found in response")
170
-
171
- while True:
172
- r = requests.get(self.task_url, headers=self._headers())
173
- r.raise_for_status()
174
- self.__dict__.update(r.json())
175
- if self.status == "Failed":
176
- raise ValueError(self.message)
177
- if self.status not in ("Starting", "Processing"):
178
- return self
179
- time.sleep(0.5)
180
-
181
- async def poll_async(self) -> 'TaskResponse':
182
- """Async poll the task for completion"""
183
- if not self.task_url:
184
- raise ValueError("Task URL not found in response")
185
-
186
- async with httpx.AsyncClient() as client:
187
- while True:
188
- r = await client.get(self.task_url, headers=self._headers())
189
- r.raise_for_status()
190
- self.__dict__.update(r.json())
191
- if self.status == "Failed":
192
- raise ValueError(self.message)
193
- if self.status not in ("Starting", "Processing"):
194
- return self
195
- await asyncio.sleep(0.5)
196
-
197
-
198
- def _get_content(self, content_type: str) -> str:
199
- """Helper method to get either HTML, Markdown, or raw content."""
200
- if not self.output:
201
- return ""
202
- parts = []
203
- for c in self.output.chunks:
204
- for s in c.segments:
205
- content = getattr(s, content_type)
206
- if content:
207
- parts.append(content)
208
- return "\n".join(parts)
209
-
210
- def html(self) -> str:
211
- """Get full HTML for the task"""
212
- return self._get_content("html")
213
-
214
- def markdown(self) -> str:
215
- """Get full markdown for the task"""
216
- return self._get_content("markdown")
217
-
218
- def content(self) -> str:
219
- """Get full text for the task"""
220
- return self._get_content("content")
221
-
222
- class TaskPayload(BaseModel):
223
- current_configuration: Configuration
224
- file_name: str
225
- image_folder_location: str
226
- input_location: str
227
- output_location: str
228
- pdf_location: str
229
- previous_configuration: Optional[Configuration]
230
- task_id: str
231
- user_id: str
@@ -1,16 +0,0 @@
1
- Metadata-Version: 2.2
2
- Name: chunkr-ai
3
- Version: 0.0.2
4
- Summary: Python client for chunkr: open source document intelligence
5
- Author-email: Ishaan Kapoor <ishaan@lumina.sh>
6
- Description-Content-Type: text/markdown
7
- License-File: LICENSE
8
- Requires-Dist: build>=1.2.2.post1
9
- Requires-Dist: httpx>=0.28.1
10
- Requires-Dist: pillow>=11.1.0
11
- Requires-Dist: pydantic>=2.10.4
12
- Requires-Dist: python-dotenv>=1.0.1
13
- Requires-Dist: requests>=2.32.3
14
- Requires-Dist: twine>=6.0.1
15
- Provides-Extra: test
16
- Requires-Dist: pytest>=8.3.4; extra == "test"
@@ -1,12 +0,0 @@
1
- chunkr_ai/__init__.py,sha256=eXygrEhGxxIHXNYIlHF2eied8rGsx2RphgR8Wo4lRyo,110
2
- chunkr_ai/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- chunkr_ai/api/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- chunkr_ai/api/auth.py,sha256=U25WiNQBsrAWYAntuds0zSMvB4gUpAwGoSa5wnQ2LRQ,454
5
- chunkr_ai/api/chunkr.py,sha256=UqFoK8ytCsW1I5F0nM4OD6I4zigy-UHzGuMDtpvMSmE,4454
6
- chunkr_ai/api/chunkr_async.py,sha256=Kfh7_DEon6QTPe-XJops8l9R6rp0zIfJKeh9ZEGFQao,1529
7
- chunkr_ai/api/models.py,sha256=vAVeRHgdSO4SDl009R2Vz75WtuXAwkUZW8ZsVXk9yBA,7221
8
- chunkr_ai-0.0.2.dist-info/LICENSE,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- chunkr_ai-0.0.2.dist-info/METADATA,sha256=ZK6gdzkukxMEVr1WxodLZ9dZNHar32C00ST1LG9mFl8,519
10
- chunkr_ai-0.0.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
11
- chunkr_ai-0.0.2.dist-info/top_level.txt,sha256=0IZY7PZIiS8bw5r4NUQRUQ-ATi-L_3vLQVq3ZLouOW8,10
12
- chunkr_ai-0.0.2.dist-info/RECORD,,