investify-utils 2.0.0a3__tar.gz → 2.0.0a5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of investify-utils might be problematic.
- {investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/PKG-INFO +5 -6
- {investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/README.md +0 -3
- investify_utils-2.0.0a5/investify_utils/kafka/__init__.py +43 -0
- investify_utils-2.0.0a5/investify_utils/kafka/async_consumer.py +210 -0
- investify_utils-2.0.0a5/investify_utils/kafka/async_producer.py +151 -0
- investify_utils-2.0.0a5/investify_utils/kafka/sync_consumer.py +201 -0
- investify_utils-2.0.0a5/investify_utils/kafka/sync_producer.py +140 -0
- investify_utils-2.0.0a5/investify_utils/s3/__init__.py +18 -0
- investify_utils-2.0.0a5/investify_utils/s3/sync_client.py +226 -0
- {investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/pyproject.toml +6 -3
- {investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/investify_utils/__init__.py +0 -0
- {investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/investify_utils/postgres/__init__.py +0 -0
- {investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/investify_utils/postgres/async_client.py +0 -0
- {investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/investify_utils/postgres/sync_client.py +0 -0
{investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: investify-utils
-Version: 2.0.0a3
+Version: 2.0.0a5
 Summary: Shared utilities for Investify services
 Author-Email: Investify <dev@investify.vn>
 License: MIT

@@ -20,8 +20,10 @@ Provides-Extra: postgres-async
 Requires-Dist: pandas>=2.0; extra == "postgres-async"
 Requires-Dist: sqlalchemy>=2.0; extra == "postgres-async"
 Requires-Dist: asyncpg>=0.29; extra == "postgres-async"
-Provides-Extra:
-Requires-Dist:
+Provides-Extra: kafka
+Requires-Dist: confluent-kafka>=2.0; extra == "kafka"
+Provides-Extra: s3
+Requires-Dist: boto3>=1.34; extra == "s3"
 Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-asyncio; extra == "dev"

@@ -40,9 +42,6 @@ pip install investify-utils[postgres]
 
 # Async PostgreSQL client (asyncpg + SQLAlchemy)
 pip install investify-utils[postgres-async]
-
-# Both clients
-pip install investify-utils[postgres-all]
 ```
 
 ## PostgreSQL Clients
investify_utils-2.0.0a5/investify_utils/kafka/__init__.py ADDED

@@ -0,0 +1,43 @@
+"""
+Kafka Avro producer and consumer clients.
+
+Sync (for Celery workers, scripts):
+    from investify_utils.kafka import AvroProducer, AvroConsumer
+
+Async (for LangGraph, FastAPI):
+    from investify_utils.kafka import AsyncAvroProducer, AsyncAvroConsumer
+"""
+
+
+def __getattr__(name: str):
+    """Lazy import to avoid loading confluent-kafka if not needed."""
+    if name == "AvroProducer":
+        from investify_utils.kafka.sync_producer import AvroProducer
+
+        return AvroProducer
+    if name == "AvroConsumer":
+        from investify_utils.kafka.sync_consumer import AvroConsumer
+
+        return AvroConsumer
+    if name == "OffsetTracker":
+        from investify_utils.kafka.sync_consumer import OffsetTracker
+
+        return OffsetTracker
+    if name == "AsyncAvroProducer":
+        from investify_utils.kafka.async_producer import AsyncAvroProducer
+
+        return AsyncAvroProducer
+    if name == "AsyncAvroConsumer":
+        from investify_utils.kafka.async_consumer import AsyncAvroConsumer
+
+        return AsyncAvroConsumer
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = [
+    "AvroProducer",
+    "AvroConsumer",
+    "OffsetTracker",
+    "AsyncAvroProducer",
+    "AsyncAvroConsumer",
+]
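The `__getattr__` hook above is the PEP 562 module-level lazy-import pattern: `confluent-kafka` is only imported when one of the exported names is first accessed. A minimal, package-independent sketch of the same idea, with `sqlite3` standing in for the heavy dependency (module and names here are illustrative, not part of investify-utils):

```python
# lazy_mod.py -- sketch of the PEP 562 lazy-import pattern used in
# investify_utils/kafka/__init__.py; sqlite3 plays the role of the heavy backend.


def __getattr__(name: str):
    """Import the backing module only when the attribute is first requested."""
    if name == "Connection":
        # Deferred import: not paid when `import lazy_mod` runs, only on first access.
        from sqlite3 import Connection

        return Connection
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


__all__ = ["Connection"]
```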
investify_utils-2.0.0a5/investify_utils/kafka/async_consumer.py ADDED

@@ -0,0 +1,210 @@
+"""
+Asynchronous Avro consumer using confluent-kafka with asyncio.
+
+Features:
+- Non-blocking poll with async/await
+- Suitable for FastAPI websocket streaming
+- Background message fetching with queue
+
+Usage:
+    from investify_utils.kafka import AsyncAvroConsumer
+
+    consumer = AsyncAvroConsumer(
+        topic="my-topic",
+        subject="my-topic-value",
+        schema_registry_url="http://localhost:8081",
+        bootstrap_servers="localhost:9092",
+        group_id="my-consumer-group",
+    )
+
+    # In FastAPI websocket
+    @app.websocket("/ws")
+    async def websocket_endpoint(websocket: WebSocket):
+        await websocket.accept()
+        async for key, value in consumer:
+            await websocket.send_json(value)
+"""
+
+import asyncio
+import logging
+from typing import AsyncIterator
+
+from confluent_kafka import DeserializingConsumer, TopicPartition
+from confluent_kafka.schema_registry import SchemaRegistryClient
+from confluent_kafka.schema_registry.avro import AvroDeserializer
+from confluent_kafka.serialization import StringDeserializer
+
+logger = logging.getLogger(__name__)
+
+
+class AsyncAvroConsumer:
+    """
+    Asynchronous Avro consumer for async frameworks.
+
+    Uses a background thread for polling (confluent-kafka is not async-native)
+    and an asyncio queue to bridge to async code.
+
+    Args:
+        topic: Kafka topic name or list of topics
+        subject: Schema Registry subject name
+        schema_registry_url: Schema Registry URL
+        bootstrap_servers: Kafka bootstrap servers
+        group_id: Consumer group ID
+        seek_to_end: Start from latest offset (default: False)
+        queue_size: Max messages to buffer (default: 100)
+        **kwargs: Additional Kafka consumer config
+    """
+
+    def __init__(
+        self,
+        topic: str | list[str],
+        subject: str,
+        schema_registry_url: str,
+        bootstrap_servers: str,
+        group_id: str,
+        seek_to_end: bool = False,
+        queue_size: int = 100,
+        **kwargs,
+    ):
+        self._schema_registry_url = schema_registry_url
+        self._bootstrap_servers = bootstrap_servers
+        self._subject = subject
+        self._group_id = group_id
+        self._topic = topic
+        self._seek_to_end = seek_to_end
+        self._queue_size = queue_size
+        self._kwargs = kwargs
+
+        self._consumer: DeserializingConsumer | None = None
+        self._queue: asyncio.Queue | None = None
+        self._poll_task: asyncio.Task | None = None
+        self._running = False
+
+    def _init_consumer(self) -> DeserializingConsumer:
+        """Initialize the consumer."""
+        schema_registry_client = SchemaRegistryClient({"url": self._schema_registry_url})
+        registered_schema = schema_registry_client.get_latest_version(self._subject)
+        schema_str = registered_schema.schema.schema_str
+
+        avro_deserializer = AvroDeserializer(schema_registry_client, schema_str)
+
+        consumer_config = {
+            "bootstrap.servers": self._bootstrap_servers,
+            "group.id": self._group_id,
+            "key.deserializer": StringDeserializer("utf_8"),
+            "value.deserializer": avro_deserializer,
+            **self._kwargs,
+        }
+        consumer = DeserializingConsumer(consumer_config)
+
+        topic_list = self._topic if isinstance(self._topic, list) else [self._topic]
+
+        if self._seek_to_end:
+
+            def seek_to_end_assign(c, partitions):
+                for p in partitions:
+                    high_offset = c.get_watermark_offsets(p)[1]
+                    p.offset = high_offset
+                c.assign(partitions)
+
+            consumer.subscribe(topic_list, on_assign=seek_to_end_assign)
+        else:
+            consumer.subscribe(topic_list)
+
+        return consumer
+
+    async def start(self) -> None:
+        """Start consuming messages in background."""
+        if self._running:
+            return
+
+        self._consumer = self._init_consumer()
+        self._queue = asyncio.Queue(maxsize=self._queue_size)
+        self._running = True
+        self._poll_task = asyncio.create_task(self._poll_loop())
+
+    async def _poll_loop(self) -> None:
+        """Background task for polling messages."""
+        loop = asyncio.get_running_loop()
+
+        while self._running:
+            # Run blocking poll in thread executor
+            msg = await loop.run_in_executor(None, self._consumer.poll, 0.1)
+
+            if msg is None:
+                continue
+
+            if msg.error():
+                logger.error(f"Consumer error: {msg.error()}")
+                continue
+
+            # Put message in queue (blocks if full)
+            await self._queue.put((msg.key(), msg.value(), msg))
+
+    async def poll(self, timeout: float = 1.0) -> tuple[str | None, dict | None, object] | None:
+        """
+        Poll for a single message.
+
+        Args:
+            timeout: Maximum time to wait in seconds
+
+        Returns:
+            Tuple of (key, value, raw_msg) or None if timeout
+        """
+        if not self._running:
+            await self.start()
+
+        try:
+            return await asyncio.wait_for(self._queue.get(), timeout=timeout)
+        except asyncio.TimeoutError:
+            return None
+
+    async def __aiter__(self) -> AsyncIterator[tuple[str | None, dict | None]]:
+        """
+        Async iterator for consuming messages.
+
+        Yields:
+            Tuple of (key, value) for each message
+        """
+        if not self._running:
+            await self.start()
+
+        while self._running:
+            try:
+                key, value, _ = await asyncio.wait_for(self._queue.get(), timeout=1.0)
+                yield key, value
+            except asyncio.TimeoutError:
+                continue
+
+    def commit(self) -> None:
+        """Commit current offsets."""
+        if self._consumer:
+            self._consumer.commit()
+
+    def commit_offsets(self, offsets: list[TopicPartition]) -> None:
+        """Commit specific offsets."""
+        if self._consumer:
+            self._consumer.commit(offsets=offsets)
+
+    async def close(self) -> None:
+        """Stop consuming and close the consumer."""
+        self._running = False
+
+        if self._poll_task:
+            self._poll_task.cancel()
+            try:
+                await self._poll_task
+            except asyncio.CancelledError:
+                pass
+            self._poll_task = None
+
+        if self._consumer:
+            self._consumer.close()
+            self._consumer = None
+
+    async def __aenter__(self):
+        await self.start()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.close()
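Besides the websocket loop shown in the module docstring, the class exposes an async context manager and an explicit `poll`/`commit` API. A hedged sketch of driving it that way; the topic, subject, group id, and endpoints below are placeholders, not values shipped with this package:

```python
# Sketch: explicit lifecycle and periodic commits with AsyncAvroConsumer.
import asyncio

from investify_utils.kafka import AsyncAvroConsumer


async def main() -> None:
    async with AsyncAvroConsumer(
        topic="prices",                              # placeholder topic
        subject="prices-value",
        schema_registry_url="http://localhost:8081",
        bootstrap_servers="localhost:9092",
        group_id="prices-reader",
    ) as consumer:
        for _ in range(100):
            item = await consumer.poll(timeout=2.0)
            if item is None:
                continue                             # nothing arrived within the timeout
            key, value, _raw = item
            print(key, value)
        consumer.commit()                            # commit offsets consumed so far


asyncio.run(main())
```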
investify_utils-2.0.0a5/investify_utils/kafka/async_producer.py ADDED

@@ -0,0 +1,151 @@
+"""
+Asynchronous Avro producer using confluent-kafka with asyncio.
+
+Features:
+- Non-blocking produce with async/await
+- Background asyncio task for polling
+- Suitable for async frameworks (LangGraph, FastAPI)
+
+Usage:
+    from investify_utils.kafka import AsyncAvroProducer
+
+    producer = AsyncAvroProducer(
+        topic="my-topic",
+        subject="my-topic-value",
+        schema_registry_url="http://localhost:8081",
+        bootstrap_servers="localhost:9092",
+    )
+
+    # In async context
+    await producer.produce(key="key1", value={"field": "value"})
+    producer.close()
+"""
+
+import asyncio
+import logging
+
+from confluent_kafka import KafkaException, SerializingProducer
+from confluent_kafka.schema_registry import SchemaRegistryClient, record_subject_name_strategy
+from confluent_kafka.schema_registry.avro import AvroSerializer
+from confluent_kafka.serialization import StringSerializer
+
+logger = logging.getLogger(__name__)
+
+
+class AsyncAvroProducer:
+    """
+    Asynchronous Avro producer for async frameworks.
+
+    Args:
+        topic: Kafka topic name
+        subject: Schema Registry subject name
+        schema_registry_url: Schema Registry URL
+        bootstrap_servers: Kafka bootstrap servers
+        **kwargs: Additional Kafka producer config
+    """
+
+    def __init__(
+        self,
+        topic: str,
+        subject: str,
+        schema_registry_url: str,
+        bootstrap_servers: str,
+        **kwargs,
+    ):
+        self.topic = topic
+        self._schema_registry_url = schema_registry_url
+        self._bootstrap_servers = bootstrap_servers
+        self._subject = subject
+        self._kwargs = kwargs
+        self._producer: SerializingProducer | None = None
+        self._poll_task: asyncio.Task | None = None
+
+    @property
+    def producer(self) -> SerializingProducer:
+        """Lazy producer initialization."""
+        if self._producer is None:
+            schema_registry_client = SchemaRegistryClient({"url": self._schema_registry_url})
+            registered_schema = schema_registry_client.get_latest_version(self._subject)
+            schema_str = registered_schema.schema.schema_str
+
+            avro_serializer = AvroSerializer(
+                schema_registry_client,
+                schema_str,
+                conf={
+                    "auto.register.schemas": False,
+                    "subject.name.strategy": record_subject_name_strategy,
+                },
+            )
+
+            producer_config = {
+                "bootstrap.servers": self._bootstrap_servers,
+                "key.serializer": StringSerializer("utf_8"),
+                "value.serializer": avro_serializer,
+                **self._kwargs,
+            }
+            self._producer = SerializingProducer(producer_config)
+
+            # Start background polling task
+            self._poll_task = asyncio.create_task(self._poll_loop())
+
+        return self._producer
+
+    async def _poll_loop(self) -> None:
+        """Background task for polling delivery callbacks."""
+        while True:
+            self._producer.poll(0.1)
+            await asyncio.sleep(0.1)
+
+    async def produce(
+        self,
+        value: dict,
+        key: str | None = None,
+        on_delivery: callable | None = None,
+    ) -> asyncio.Future:
+        """
+        Produce a message asynchronously.
+
+        Args:
+            value: Message value (dict matching Avro schema)
+            key: Optional message key
+            on_delivery: Optional callback(err, msg) for delivery confirmation
+
+        Returns:
+            Future that resolves to the delivered message
+        """
+        loop = asyncio.get_running_loop()
+        result = loop.create_future()
+
+        def ack(err, msg):
+            if err:
+                loop.call_soon_threadsafe(result.set_exception, KafkaException(err))
+            else:
+                loop.call_soon_threadsafe(result.set_result, msg)
+            if on_delivery:
+                loop.call_soon_threadsafe(on_delivery, err, msg)
+
+        self.producer.produce(self.topic, key=key, value=value, on_delivery=ack)
+        return await result
+
+    def flush(self, timeout: float = 10.0) -> int:
+        """
+        Wait for all messages to be delivered.
+
+        Args:
+            timeout: Maximum time to wait in seconds
+
+        Returns:
+            Number of messages still in queue
+        """
+        if self._producer:
+            return self._producer.flush(timeout)
+        return 0
+
+    def close(self) -> None:
+        """Cancel polling task and flush pending messages."""
+        if self._poll_task:
+            self._poll_task.cancel()
+            self._poll_task = None
+        if self._producer:
+            self._producer.flush()
+            self._producer = None
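Because `produce` wraps the delivery callback in an awaitable future, a caller can wait for broker acknowledgement inline. A hedged sketch of that pattern; the topic, subject, endpoints, and payload are illustrative, and the value dict is assumed to match a schema already registered under the given subject:

```python
# Sketch: awaiting delivery confirmation from AsyncAvroProducer.
import asyncio

from confluent_kafka import KafkaException

from investify_utils.kafka import AsyncAvroProducer


async def main() -> None:
    producer = AsyncAvroProducer(
        topic="orders",                              # placeholder topic
        subject="orders-value",
        schema_registry_url="http://localhost:8081",
        bootstrap_servers="localhost:9092",
    )
    try:
        msg = await producer.produce(value={"id": "o-1", "qty": 3}, key="o-1")
        # The resolved future is the delivered confluent_kafka Message.
        print("delivered to", msg.topic(), msg.partition(), msg.offset())
    except KafkaException as exc:
        print("delivery failed:", exc)
    finally:
        producer.close()


asyncio.run(main())
```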
investify_utils-2.0.0a5/investify_utils/kafka/sync_consumer.py ADDED

@@ -0,0 +1,201 @@
+"""
+Synchronous Avro consumer using confluent-kafka.
+
+Features:
+- Avro deserialization with Schema Registry
+- Offset tracking for reliable commits
+- Seek to end option for real-time streaming
+
+Usage:
+    from investify_utils.kafka import AvroConsumer
+
+    consumer = AvroConsumer(
+        topic="my-topic",
+        subject="my-topic-value",
+        schema_registry_url="http://localhost:8081",
+        bootstrap_servers="localhost:9092",
+        group_id="my-consumer-group",
+    )
+
+    try:
+        while True:
+            msg = consumer.poll(timeout=1.0)
+            if msg is None:
+                continue
+            print(msg.key(), msg.value())
+            consumer.commit()
+    finally:
+        consumer.close()
+"""
+
+import logging
+from collections import defaultdict
+
+from confluent_kafka import DeserializingConsumer, TopicPartition
+from confluent_kafka.schema_registry import SchemaRegistryClient
+from confluent_kafka.schema_registry.avro import AvroDeserializer
+from confluent_kafka.serialization import StringDeserializer
+
+logger = logging.getLogger(__name__)
+
+
+class OffsetTracker:
+    """
+    Track message offsets for reliable commits with out-of-order processing.
+
+    Only commits offsets where all previous offsets have been marked done,
+    preventing data loss when processing messages in parallel.
+    """
+
+    def __init__(self):
+        self._tracker = defaultdict(lambda: defaultdict(lambda: defaultdict(bool)))
+
+    def log(self, msg, done: bool = False) -> None:
+        """
+        Log a message offset.
+
+        Args:
+            msg: Kafka message
+            done: Whether processing is complete
+        """
+        topic = msg.topic()
+        partition = msg.partition()
+        offset = msg.offset()
+        self._tracker[topic][partition][offset] = done
+
+    def prepare_commit_offsets(self) -> list[TopicPartition]:
+        """
+        Get offsets safe to commit.
+
+        Returns:
+            List of TopicPartition with offsets to commit
+        """
+        new_tracker = defaultdict(lambda: defaultdict(lambda: defaultdict(bool)))
+        to_commit_offsets = []
+
+        for topic, partitions in self._tracker.items():
+            for partition, offsets in partitions.items():
+                sorted_offsets = sorted(offsets.items())
+                to_commit_offset = None
+                update_to_commit_offset = True
+
+                for offset, done in sorted_offsets:
+                    update_to_commit_offset = update_to_commit_offset and done
+                    if update_to_commit_offset:
+                        to_commit_offset = offset + 1
+                    else:
+                        new_tracker[topic][partition][offset] = done
+
+                if to_commit_offset is not None:
+                    to_commit_offsets.append(TopicPartition(topic, partition, to_commit_offset))
+
+        self._tracker = new_tracker
+        return to_commit_offsets
+
+
+class AvroConsumer:
+    """
+    Synchronous Avro consumer.
+
+    Args:
+        topic: Kafka topic name or list of topics
+        subject: Schema Registry subject name
+        schema_registry_url: Schema Registry URL
+        bootstrap_servers: Kafka bootstrap servers
+        group_id: Consumer group ID
+        seek_to_end: Start from latest offset (default: False)
+        **kwargs: Additional Kafka consumer config
+    """
+
+    def __init__(
+        self,
+        topic: str | list[str],
+        subject: str,
+        schema_registry_url: str,
+        bootstrap_servers: str,
+        group_id: str,
+        seek_to_end: bool = False,
+        **kwargs,
+    ):
+        self._schema_registry_url = schema_registry_url
+        self._bootstrap_servers = bootstrap_servers
+        self._subject = subject
+        self._group_id = group_id
+        self._topic = topic
+        self._seek_to_end = seek_to_end
+        self._kwargs = kwargs
+        self._consumer: DeserializingConsumer | None = None
+
+    @property
+    def consumer(self) -> DeserializingConsumer:
+        """Lazy consumer initialization."""
+        if self._consumer is None:
+            schema_registry_client = SchemaRegistryClient({"url": self._schema_registry_url})
+            registered_schema = schema_registry_client.get_latest_version(self._subject)
+            schema_str = registered_schema.schema.schema_str
+
+            avro_deserializer = AvroDeserializer(schema_registry_client, schema_str)
+
+            consumer_config = {
+                "bootstrap.servers": self._bootstrap_servers,
+                "group.id": self._group_id,
+                "key.deserializer": StringDeserializer("utf_8"),
+                "value.deserializer": avro_deserializer,
+                **self._kwargs,
+            }
+            self._consumer = DeserializingConsumer(consumer_config)
+
+            topic_list = self._topic if isinstance(self._topic, list) else [self._topic]
+
+            if self._seek_to_end:
+
+                def seek_to_end_assign(consumer, partitions):
+                    for p in partitions:
+                        high_offset = consumer.get_watermark_offsets(p)[1]
+                        p.offset = high_offset
+                    consumer.assign(partitions)
+
+                self._consumer.subscribe(topic_list, on_assign=seek_to_end_assign)
+            else:
+                self._consumer.subscribe(topic_list)
+
+        return self._consumer
+
+    def poll(self, timeout: float = 1.0):
+        """
+        Poll for a message.
+
+        Args:
+            timeout: Maximum time to wait in seconds
+
+        Returns:
+            Message object or None if no message available
+        """
+        msg = self.consumer.poll(timeout)
+        if msg is None:
+            return None
+
+        if msg.error():
+            logger.error(f"Consumer error: {msg.error()}")
+            return None
+
+        return msg
+
+    def commit(self) -> None:
+        """Commit current offsets."""
+        self.consumer.commit()
+
+    def commit_offsets(self, offsets: list[TopicPartition]) -> None:
+        """
+        Commit specific offsets.
+
+        Args:
+            offsets: List of TopicPartition with offsets to commit
+        """
+        self.consumer.commit(offsets=offsets)
+
+    def close(self) -> None:
+        """Close the consumer."""
+        if self._consumer:
+            self._consumer.close()
+            self._consumer = None
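The module docstring shows the plain poll/commit loop; the `OffsetTracker` adds at-least-once safety when work completes out of order, since `prepare_commit_offsets` only releases offsets whose predecessors are all marked done. One way the two classes could be combined (a sketch; `process_message` and the connection settings are hypothetical placeholders):

```python
# Sketch: pairing AvroConsumer with OffsetTracker for at-least-once commits.
from investify_utils.kafka import AvroConsumer, OffsetTracker


def process_message(value: dict) -> None:
    print(value)  # stand-in for real work


consumer = AvroConsumer(
    topic="trades",                                  # placeholder topic
    subject="trades-value",
    schema_registry_url="http://localhost:8081",
    bootstrap_servers="localhost:9092",
    group_id="trades-worker",
)
tracker = OffsetTracker()

try:
    while True:
        msg = consumer.poll(timeout=1.0)
        if msg is None:
            continue
        tracker.log(msg)                             # mark as in-flight
        process_message(msg.value())
        tracker.log(msg, done=True)                  # mark as finished
        # Only offsets with no unfinished predecessors are handed back here.
        to_commit = tracker.prepare_commit_offsets()
        if to_commit:
            consumer.commit_offsets(to_commit)
finally:
    consumer.close()
```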
investify_utils-2.0.0a5/investify_utils/kafka/sync_producer.py ADDED

@@ -0,0 +1,140 @@
+"""
+Synchronous Avro producer using confluent-kafka.
+
+Features:
+- Avro serialization with Schema Registry
+- Background polling thread for delivery callbacks
+- Thread-safe produce operations
+
+Usage:
+    from investify_utils.kafka import AvroProducer
+
+    producer = AvroProducer(
+        topic="my-topic",
+        subject="my-topic-value",
+        schema_registry_url="http://localhost:8081",
+        bootstrap_servers="localhost:9092",
+    )
+
+    producer.produce(key="key1", value={"field": "value"})
+    producer.flush()
+    producer.close()
+"""
+
+import logging
+import threading
+
+from confluent_kafka import SerializingProducer
+from confluent_kafka.schema_registry import SchemaRegistryClient, record_subject_name_strategy
+from confluent_kafka.schema_registry.avro import AvroSerializer
+from confluent_kafka.serialization import StringSerializer
+
+logger = logging.getLogger(__name__)
+
+
+class AvroProducer:
+    """
+    Synchronous Avro producer with background polling.
+
+    Args:
+        topic: Kafka topic name
+        subject: Schema Registry subject name (usually "{topic}-value")
+        schema_registry_url: Schema Registry URL
+        bootstrap_servers: Kafka bootstrap servers
+        **kwargs: Additional Kafka producer config
+    """
+
+    def __init__(
+        self,
+        topic: str,
+        subject: str,
+        schema_registry_url: str,
+        bootstrap_servers: str,
+        **kwargs,
+    ):
+        self.topic = topic
+        self._schema_registry_url = schema_registry_url
+        self._bootstrap_servers = bootstrap_servers
+        self._subject = subject
+        self._kwargs = kwargs
+        self._producer: SerializingProducer | None = None
+        self._shutdown_event: threading.Event | None = None
+        self._poll_thread: threading.Thread | None = None
+
+    @property
+    def producer(self) -> SerializingProducer:
+        """Lazy producer initialization."""
+        if self._producer is None:
+            schema_registry_client = SchemaRegistryClient({"url": self._schema_registry_url})
+            registered_schema = schema_registry_client.get_latest_version(self._subject)
+            schema_str = registered_schema.schema.schema_str
+
+            avro_serializer = AvroSerializer(
+                schema_registry_client,
+                schema_str,
+                conf={
+                    "auto.register.schemas": False,
+                    "subject.name.strategy": record_subject_name_strategy,
+                },
+            )
+
+            producer_config = {
+                "bootstrap.servers": self._bootstrap_servers,
+                "key.serializer": StringSerializer("utf_8"),
+                "value.serializer": avro_serializer,
+                **self._kwargs,
+            }
+            self._producer = SerializingProducer(producer_config)
+
+            # Start background polling thread
+            self._shutdown_event = threading.Event()
+            self._poll_thread = threading.Thread(target=self._poll_loop, daemon=True)
+            self._poll_thread.start()
+
+        return self._producer
+
+    def _poll_loop(self) -> None:
+        """Background thread for polling delivery callbacks."""
+        while not self._shutdown_event.is_set():
+            self._producer.poll(0.1)
+            self._shutdown_event.wait(0.1)
+
+    def produce(
+        self,
+        value: dict,
+        key: str | None = None,
+        on_delivery: callable | None = None,
+    ) -> None:
+        """
+        Produce a message to Kafka.
+
+        Args:
+            value: Message value (dict matching Avro schema)
+            key: Optional message key
+            on_delivery: Optional callback(err, msg) for delivery confirmation
+        """
+        try:
+            self.producer.produce(self.topic, key=key, value=value, on_delivery=on_delivery)
+        except Exception as e:
+            logger.error(f"Failed to produce message: {e!r}")
+            raise
+
+    def flush(self, timeout: float = 10.0) -> int:
+        """
+        Wait for all messages to be delivered.
+
+        Args:
+            timeout: Maximum time to wait in seconds
+
+        Returns:
+            Number of messages still in queue (0 if all delivered)
+        """
+        return self.producer.flush(timeout)
+
+    def close(self) -> None:
+        """Flush pending messages and stop background polling."""
+        if self._shutdown_event:
+            self._shutdown_event.set()
+        if self._producer:
+            self._producer.flush()
+            self._producer = None
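Because the `producer` property is lazy, a module-level `AvroProducer` opens no connection at import time, which is what makes it usable from Celery workers as the package docstring suggests. A hedged sketch of that pattern; the Celery app name, broker URL, topic, and subject are placeholders and not part of investify-utils:

```python
# Sketch: reusing a module-level AvroProducer inside a Celery task.
from celery import Celery

from investify_utils.kafka import AvroProducer

app = Celery("worker", broker="redis://localhost:6379/0")   # placeholder broker

events = AvroProducer(
    topic="events",                                          # placeholder topic
    subject="events-value",
    schema_registry_url="http://localhost:8081",
    bootstrap_servers="localhost:9092",
)


@app.task
def emit_event(payload: dict) -> None:
    # First call in each worker process triggers the lazy connection.
    events.produce(value=payload, key=payload.get("id"))
    events.flush(timeout=5.0)
```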
investify_utils-2.0.0a5/investify_utils/s3/__init__.py ADDED

@@ -0,0 +1,18 @@
+"""
+S3-compatible object storage client.
+
+Usage:
+    from investify_utils.s3 import S3Client
+"""
+
+
+def __getattr__(name: str):
+    """Lazy import to avoid loading boto3 if not needed."""
+    if name == "S3Client":
+        from investify_utils.s3.sync_client import S3Client
+
+        return S3Client
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = ["S3Client"]
investify_utils-2.0.0a5/investify_utils/s3/sync_client.py ADDED

@@ -0,0 +1,226 @@
+"""
+S3-compatible object storage client using boto3.
+
+Features:
+- Works with AWS S3, Ceph RGW, MinIO, and other S3-compatible services
+- Lazy client initialization (safe for Celery prefork)
+- Common operations: upload, download, get, put, delete, list
+
+Usage:
+    from investify_utils.s3 import S3Client
+
+    client = S3Client(
+        endpoint_url="https://s3.example.com",
+        access_key="access_key",
+        secret_key="secret_key",
+    )
+
+    # Upload file
+    client.upload_file("local.pdf", bucket="my-bucket", key="remote.pdf")
+
+    # Get object as bytes
+    data = client.get_object("my-bucket", "remote.pdf")
+
+    # Put object from bytes/string
+    client.put_object("my-bucket", "file.txt", b"content", content_type="text/plain")
+"""
+
+import io
+import os
+from typing import IO
+
+import boto3
+from botocore.config import Config
+from botocore.exceptions import ClientError
+
+
+class S3Client:
+    """
+    S3-compatible object storage client with lazy initialization.
+
+    Args:
+        endpoint_url: S3 endpoint URL (e.g., https://s3.amazonaws.com)
+        access_key: AWS access key ID
+        secret_key: AWS secret access key
+        region: AWS region (default: None)
+        **kwargs: Additional boto3 client options
+    """
+
+    def __init__(
+        self,
+        endpoint_url: str,
+        access_key: str,
+        secret_key: str,
+        region: str | None = None,
+        **kwargs,
+    ):
+        self._endpoint_url = endpoint_url
+        self._access_key = access_key
+        self._secret_key = secret_key
+        self._region = region
+        self._kwargs = kwargs
+        self._client = None
+
+    @property
+    def client(self):
+        """Lazy client initialization - created on first access."""
+        if self._client is None:
+            self._client = boto3.client(
+                "s3",
+                endpoint_url=self._endpoint_url,
+                aws_access_key_id=self._access_key,
+                aws_secret_access_key=self._secret_key,
+                region_name=self._region,
+                config=Config(signature_version="s3v4"),
+                **self._kwargs,
+            )
+        return self._client
+
+    def list_buckets(self) -> list[str]:
+        """List all buckets."""
+        response = self.client.list_buckets()
+        return [bucket["Name"] for bucket in response["Buckets"]]
+
+    def list_objects(
+        self,
+        bucket: str,
+        prefix: str = "",
+        max_keys: int | None = None,
+    ) -> list[dict]:
+        """
+        List objects in a bucket with optional prefix filter.
+
+        Args:
+            bucket: Bucket name
+            prefix: Filter objects by prefix (e.g., "folder/")
+            max_keys: Maximum number of objects to return (None = all)
+
+        Returns:
+            List of object metadata dicts with keys: Key, Size, LastModified
+        """
+        objects = []
+        paginator = self.client.get_paginator("list_objects_v2")
+
+        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
+            for obj in page.get("Contents", []):
+                objects.append({
+                    "Key": obj["Key"],
+                    "Size": obj["Size"],
+                    "LastModified": obj["LastModified"],
+                })
+                if max_keys and len(objects) >= max_keys:
+                    return objects
+
+        return objects
+
+    def upload_file(self, file_path: str, bucket: str, key: str | None = None) -> None:
+        """
+        Upload a local file to S3.
+
+        Args:
+            file_path: Local file path
+            bucket: Bucket name
+            key: Object key (default: basename of file_path)
+        """
+        if key is None:
+            key = os.path.basename(file_path)
+        self.client.upload_file(file_path, bucket, key)
+
+    def download_file(self, bucket: str, key: str, file_path: str) -> None:
+        """
+        Download an object to a local file.
+
+        Args:
+            bucket: Bucket name
+            key: Object key
+            file_path: Local file path to save to
+        """
+        self.client.download_file(bucket, key, file_path)
+
+    def get_object(self, bucket: str, key: str) -> IO[bytes]:
+        """
+        Get object content as a file-like BytesIO object.
+
+        Args:
+            bucket: Bucket name
+            key: Object key
+
+        Returns:
+            BytesIO object with object content
+        """
+        response = self.client.get_object(Bucket=bucket, Key=key)
+        return io.BytesIO(response["Body"].read())
+
+    def put_object(
+        self,
+        bucket: str,
+        key: str,
+        data: str | bytes | IO[bytes],
+        content_type: str | None = None,
+        content_disposition: str | None = None,
+    ) -> None:
+        """
+        Upload data directly to S3.
+
+        Args:
+            bucket: Bucket name
+            key: Object key
+            data: Content as string, bytes, or file-like object
+            content_type: MIME type (e.g., "application/pdf")
+            content_disposition: Content-Disposition header value
+        """
+        params = {"Bucket": bucket, "Key": key, "Body": data}
+        if content_type:
+            params["ContentType"] = content_type
+        if content_disposition:
+            params["ContentDisposition"] = content_disposition
+        self.client.put_object(**params)
+
+    def delete_object(self, bucket: str, key: str) -> None:
+        """Delete a single object."""
+        self.client.delete_object(Bucket=bucket, Key=key)
+
+    def delete_prefix(self, bucket: str, prefix: str) -> int:
+        """
+        Delete all objects with a given prefix.
+
+        Args:
+            bucket: Bucket name
+            prefix: Prefix to match (e.g., "folder/" deletes all in folder)
+
+        Returns:
+            Number of objects deleted
+        """
+        deleted_count = 0
+        paginator = self.client.get_paginator("list_objects_v2")
+
+        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
+            contents = page.get("Contents", [])
+            if not contents:
+                continue
+
+            delete_keys = [{"Key": obj["Key"]} for obj in contents]
+            self.client.delete_objects(Bucket=bucket, Delete={"Objects": delete_keys})
+            deleted_count += len(delete_keys)
+
+        return deleted_count
+
+    def exists(self, bucket: str, key: str) -> bool:
+        """
+        Check if an object exists.
+
+        Args:
+            bucket: Bucket name
+            key: Object key
+
+        Returns:
+            True if object exists, False otherwise
+        """
+        try:
+            self.client.head_object(Bucket=bucket, Key=key)
+            return True
+        except ClientError as e:
+            if e.response["Error"]["Code"] == "404":
+                return False
+            raise
+
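A sketch of a full round trip with `S3Client` against an S3-compatible endpoint such as a local MinIO; the endpoint, credentials, bucket, and keys below are placeholders and the bucket is assumed to exist:

```python
# Sketch: put, check, read, list, and prefix-delete with S3Client.
from investify_utils.s3 import S3Client

client = S3Client(
    endpoint_url="http://localhost:9000",            # placeholder MinIO endpoint
    access_key="minioadmin",
    secret_key="minioadmin",
)

client.put_object("reports", "2024/q1.txt", b"quarterly numbers", content_type="text/plain")

if client.exists("reports", "2024/q1.txt"):
    body = client.get_object("reports", "2024/q1.txt")   # returns BytesIO
    print(body.read().decode())

for obj in client.list_objects("reports", prefix="2024/"):
    print(obj["Key"], obj["Size"], obj["LastModified"])

deleted = client.delete_prefix("reports", "2024/")        # number of objects removed
print("deleted", deleted)
```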
{investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/pyproject.toml

@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
 
 [project]
 name = "investify-utils"
-version = "2.0.0a3"
+version = "2.0.0a5"
 description = "Shared utilities for Investify services"
 readme = "README.md"
 requires-python = ">=3.12"

@@ -40,8 +40,11 @@ postgres-async = [
     "sqlalchemy>=2.0",
     "asyncpg>=0.29",
 ]
-
-"
+kafka = [
+    "confluent-kafka>=2.0",
+]
+s3 = [
+    "boto3>=1.34",
 ]
 dev = [
     "pytest",
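With these optional dependency groups in place, the Kafka and S3 clients would presumably be pulled in via the corresponding extras, e.g. `pip install "investify-utils[kafka,s3]"`, mirroring the existing `postgres` and `postgres-async` extras shown in the README.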
{investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/investify_utils/__init__.py RENAMED
File without changes

{investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/investify_utils/postgres/__init__.py RENAMED
File without changes

{investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/investify_utils/postgres/async_client.py RENAMED
File without changes

{investify_utils-2.0.0a3 → investify_utils-2.0.0a5}/investify_utils/postgres/sync_client.py RENAMED
File without changes