discovery-engine-api 0.2.98__tar.gz → 0.2.100__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: discovery-engine-api
3
- Version: 0.2.98
3
+ Version: 0.2.100
4
4
  Summary: Python SDK for Disco API
5
5
  Project-URL: Homepage, https://www.leap-labs.com
6
6
  Project-URL: Documentation, https://disco.leap-labs.com/llms-full.txt
@@ -248,7 +248,6 @@ estimate = await engine.estimate(
248
248
  )
249
249
  # estimate["cost"]["credits"] -> 55
250
250
  # estimate["cost"]["price_usd"] -> 5.5
251
- # estimate["time_estimate"]["estimated_seconds"] -> 360
252
251
  # estimate["account"]["sufficient"] -> True/False
253
252
  # estimate["limits"]["max_analysis_depth"] -> 23 (num_columns - 2)
254
253
  ```
@@ -299,7 +298,6 @@ class EngineResult:
299
298
  queue_position: int | None # Position in queue when pending (1 = next up)
300
299
  current_step: str | None # Active pipeline step (preprocessing, training, interpreting, reporting)
301
300
  current_step_message: str | None # Human-readable description of the current step
302
- estimated_seconds: int | None # Estimated total processing time in seconds
303
301
  estimated_wait_seconds: int | None # Estimated queue wait time in seconds (pending only)
304
302
  error_message: str | None
305
303
  report_url: str | None # Shareable link to interactive web report
@@ -211,7 +211,6 @@ estimate = await engine.estimate(
211
211
  )
212
212
  # estimate["cost"]["credits"] -> 55
213
213
  # estimate["cost"]["price_usd"] -> 5.5
214
- # estimate["time_estimate"]["estimated_seconds"] -> 360
215
214
  # estimate["account"]["sufficient"] -> True/False
216
215
  # estimate["limits"]["max_analysis_depth"] -> 23 (num_columns - 2)
217
216
  ```
@@ -262,7 +261,6 @@ class EngineResult:
262
261
  queue_position: int | None # Position in queue when pending (1 = next up)
263
262
  current_step: str | None # Active pipeline step (preprocessing, training, interpreting, reporting)
264
263
  current_step_message: str | None # Human-readable description of the current step
265
- estimated_seconds: int | None # Estimated total processing time in seconds
266
264
  estimated_wait_seconds: int | None # Estimated queue wait time in seconds (pending only)
267
265
  error_message: str | None
268
266
  report_url: str | None # Shareable link to interactive web report
@@ -1,6 +1,6 @@
1
1
  """Disco Python SDK."""
2
2
 
3
- __version__ = "0.2.98"
3
+ __version__ = "0.2.100"
4
4
 
5
5
  from discovery.client import Engine
6
6
  from discovery.types import (
@@ -157,9 +157,6 @@ class Engine:
157
157
  the code interactively, then provisions the account and returns a
158
158
  configured Engine with a ``disco_`` API key.
159
159
 
160
- If the email service is unavailable, falls back to direct provisioning
161
- and returns immediately (no code required).
162
-
163
160
  Args:
164
161
  email: Email address for the new account.
165
162
  name: Display name (optional — defaults to email local part).
@@ -180,7 +177,7 @@ class Engine:
180
177
  cls._raise_for_status(response)
181
178
  data = response.json()
182
179
 
183
- # Direct provisioning fallback (Resend unavailable) already have the key
180
+ # If the server returned a key directly, use it
184
181
  if data.get("key"):
185
182
  engine = cls(api_key=data["key"], quiet=quiet)
186
183
  if not quiet:
@@ -284,7 +281,6 @@ class Engine:
284
281
 
285
282
  This is the primary method. It uploads data, submits the analysis,
286
283
  polls for completion, and returns structured results — all in one call.
287
- Runs typically take 3-15 minutes.
288
284
 
289
285
  Args:
290
286
  file: File path, Path object, or pandas DataFrame.
@@ -440,12 +436,11 @@ class Engine:
440
436
  self,
441
437
  file_size_mb: float,
442
438
  num_columns: int,
443
- num_rows: Optional[int] = None,
444
439
  analysis_depth: int = 2,
445
440
  visibility: str = "public",
446
441
  use_llms: bool = False,
447
442
  ) -> Dict[str, Any]:
448
- """Estimate cost and time for an analysis run.
443
+ """Estimate the credit cost for an analysis run.
449
444
 
450
445
  Works with or without authentication. If authenticated, the response
451
446
  includes your current credit balance and whether you have enough.
@@ -453,7 +448,6 @@ class Engine:
453
448
  Args:
454
449
  file_size_mb: Size of the data file in megabytes.
455
450
  num_columns: Number of columns in the dataset.
456
- num_rows: Number of rows (improves time estimate accuracy).
457
451
  analysis_depth: Depth iterations (1=fast, higher=deeper).
458
452
  visibility: "public" (free, results published) or "private" (costs credits).
459
453
  use_llms: Slower and more expensive, but you get smarter pre-processing,
@@ -461,7 +455,7 @@ class Engine:
461
455
  always use LLMs.
462
456
 
463
457
  Returns:
464
- Dict with ``cost``, ``time_estimate``, ``limits``, and ``account`` info.
458
+ Dict with ``cost``, ``limits``, and ``account`` info.
465
459
  """
466
460
  client = await self._get_dashboard_client()
467
461
  response = await client.post(
@@ -469,7 +463,6 @@ class Engine:
469
463
  json={
470
464
  "file_size_mb": file_size_mb,
471
465
  "num_columns": num_columns,
472
- "num_rows": num_rows,
473
466
  "analysis_depth": analysis_depth,
474
467
  "visibility": visibility,
475
468
  "use_llms": use_llms,
@@ -516,7 +509,6 @@ class Engine:
516
509
  current_step_message=data.get("current_step", {}).get("message")
517
510
  if data.get("current_step")
518
511
  else None,
519
- estimated_seconds=data.get("estimated_seconds"),
520
512
  estimated_wait_seconds=data.get("estimated_wait_seconds"),
521
513
  error_message=data.get("error_message"),
522
514
  )
@@ -600,14 +592,7 @@ class Engine:
600
592
  else ""
601
593
  )
602
594
  step_str = f" ({result.current_step}{msg})"
603
- eta_str = ""
604
- if result.estimated_seconds is not None and elapsed > 0:
605
- remaining = max(0, result.estimated_seconds - elapsed)
606
- if remaining > 0:
607
- eta_str = f" | ETA: ~{max(1, round(remaining / 60))} min"
608
- status_msg = (
609
- f"Status: {result.status}{step_str} | Elapsed: {elapsed:.1f}s{eta_str}"
610
- )
595
+ status_msg = f"Status: {result.status}{step_str} | Elapsed: {elapsed:.1f}s"
611
596
  self._log(f" {status_msg}")
612
597
 
613
598
  last_status = result.status
@@ -638,15 +623,45 @@ class Engine:
638
623
  # File upload
639
624
  # ------------------------------------------------------------------
640
625
 
626
+ # 8 MB chunks — large enough to amortize syscall + TLS overhead, small
627
+ # enough that memory stays bounded regardless of file size.
628
+ _UPLOAD_CHUNK_SIZE = 8 * 1024 * 1024
629
+
630
+ @staticmethod
631
+ async def _stream_file_chunks(path: Path, chunk_size: int):
632
+ """Yield file contents in fixed-size chunks for streaming uploads.
633
+
634
+ Reads from disk via run_in_executor so the event loop isn't blocked
635
+ on the read syscall. Memory stays bounded to one chunk regardless
636
+ of file size — required for multi-GB uploads that previously OOMed
637
+ or hit ``_ssl.c:2426`` when passed as a single bytes object.
638
+ """
639
+ loop = asyncio.get_event_loop()
640
+ with path.open("rb") as f:
641
+ while True:
642
+ chunk = await loop.run_in_executor(None, f.read, chunk_size)
643
+ if not chunk:
644
+ break
645
+ yield chunk
646
+
641
647
  async def _presign_and_upload(
642
648
  self,
643
- file_content: bytes,
649
+ file_source: Union[bytes, Path],
644
650
  filename: str,
645
651
  mime_type: str,
646
652
  ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
647
- """Upload a file using presigned URL (3-step: presign, upload, finalize)."""
653
+ """Upload a file using presigned URL (3-step: presign, upload, finalize).
654
+
655
+ ``file_source`` may be either an in-memory ``bytes`` object (used for
656
+ DataFrame uploads) or a ``Path`` to a file on disk. Path inputs are
657
+ streamed chunk-by-chunk so memory stays bounded for multi-GB files.
658
+ """
648
659
  dashboard_client = await self._get_dashboard_client()
649
- file_size = len(file_content)
660
+
661
+ if isinstance(file_source, Path):
662
+ file_size = file_source.stat().st_size
663
+ else:
664
+ file_size = len(file_source)
650
665
 
651
666
  presign_response = await dashboard_client.post(
652
667
  "/api/data/upload/presign",
@@ -670,11 +685,22 @@ class Engine:
670
685
  key = presign_data["key"]
671
686
  upload_token = presign_data["uploadToken"]
672
687
 
688
+ # GCS XML API does not accept chunked transfer encoding on PUT, so
689
+ # we must always send an explicit Content-Length. For Path inputs we
690
+ # stream from disk; for bytes we let httpx send the buffer directly.
691
+ if isinstance(file_source, Path):
692
+ upload_content = self._stream_file_chunks(file_source, self._UPLOAD_CHUNK_SIZE)
693
+ else:
694
+ upload_content = file_source
695
+
673
696
  async with httpx.AsyncClient(timeout=self._TIMEOUT) as upload_client:
674
697
  upload_response = await upload_client.put(
675
698
  upload_url,
676
- content=file_content,
677
- headers={"Content-Type": mime_type},
699
+ content=upload_content,
700
+ headers={
701
+ "Content-Type": mime_type,
702
+ "Content-Length": str(file_size),
703
+ },
678
704
  )
679
705
  if upload_response.status_code >= 400:
680
706
  raise ValueError(
@@ -702,13 +728,13 @@ class Engine:
702
728
 
703
729
  async def _upload_file_direct(
704
730
  self,
705
- file_content: bytes,
731
+ file_source: Union[bytes, Path],
706
732
  filename: str,
707
733
  mime_type: str,
708
734
  ) -> Dict[str, Any]:
709
735
  """Upload a file using presigned URL. Returns finalize result."""
710
736
  _, finalize_data = await self._presign_and_upload(
711
- file_content=file_content,
737
+ file_source=file_source,
712
738
  filename=filename,
713
739
  mime_type=mime_type,
714
740
  )
@@ -720,7 +746,14 @@ class Engine:
720
746
  filename: Optional[str] = None,
721
747
  title: Optional[str] = None,
722
748
  log: bool = False,
723
- ) -> Tuple[bytes, str, str, float]:
749
+ ) -> Tuple[Union[bytes, Path], str, str, float]:
750
+ """Resolve an upload source into (source, filename, mime_type, size_mb).
751
+
752
+ Returns ``Path`` for file inputs (streamed in ``_presign_and_upload``)
753
+ and ``bytes`` for DataFrame inputs (kept in memory). DataFrame uploads
754
+ are bounded by available RAM; users with multi-GB DataFrames should
755
+ ``df.to_csv(path, index=False)`` and pass the path instead.
756
+ """
724
757
  if pd is not None and isinstance(file, pd.DataFrame):
725
758
  import io
726
759
 
@@ -728,8 +761,8 @@ class Engine:
728
761
  self._log(f"Preparing DataFrame ({len(file)} rows, {len(file.columns)} columns)...")
729
762
  buffer = io.BytesIO()
730
763
  file.to_csv(buffer, index=False)
731
- buffer.seek(0)
732
- file_content = buffer.getvalue()
764
+ file_source: Union[bytes, Path] = buffer.getvalue()
765
+ file_size = len(file_source)
733
766
  resolved_filename = filename or ((title + ".csv") if title else "dataset.csv")
734
767
  mime_type = "text/csv"
735
768
  else:
@@ -738,7 +771,8 @@ class Engine:
738
771
  raise FileNotFoundError(f"File not found: {file_path}")
739
772
  if log:
740
773
  self._log(f"Reading file: {file_path.name}...")
741
- file_content = file_path.read_bytes()
774
+ file_source = file_path
775
+ file_size = file_path.stat().st_size
742
776
  resolved_filename = filename or file_path.name
743
777
  _MIME_TYPES = {
744
778
  ".csv": "text/csv",
@@ -752,10 +786,10 @@ class Engine:
752
786
  }
753
787
  mime_type = _MIME_TYPES.get(file_path.suffix.lower(), "text/csv")
754
788
 
755
- file_size_mb = len(file_content) / (1024 * 1024)
789
+ file_size_mb = file_size / (1024 * 1024)
756
790
  if log:
757
791
  self._log(f" File size: {file_size_mb:.2f} MB")
758
- return file_content, resolved_filename, mime_type, file_size_mb
792
+ return file_source, resolved_filename, mime_type, file_size_mb
759
793
 
760
794
  # ------------------------------------------------------------------
761
795
  # File upload
@@ -776,13 +810,13 @@ class Engine:
776
810
  Dict with ``file`` (key, name, size, fileHash) and ``columns``
777
811
  (list of dicts with ``name``, ``type``, ``enabled``).
778
812
  """
779
- file_content, filename, mime_type, _ = self._prepare_upload(
813
+ file_source, filename, mime_type, _ = self._prepare_upload(
780
814
  file=file,
781
815
  title=title,
782
816
  log=True,
783
817
  )
784
818
  self._log(" Uploading to storage...")
785
- result = await self._upload_file_direct(file_content, filename, mime_type)
819
+ result = await self._upload_file_direct(file_source, filename, mime_type)
786
820
 
787
821
  if not result.get("ok"):
788
822
  errors = result.get("issues", {}).get("errors", [])
@@ -859,7 +893,7 @@ class Engine:
859
893
  f"Creating run from pre-uploaded file (depth: {analysis_depth}, target: {target_column})..."
860
894
  )
861
895
  else:
862
- file_content, filename, mime_type, _ = self._prepare_upload(
896
+ file_source, filename, mime_type, _ = self._prepare_upload(
863
897
  file=file,
864
898
  title=title,
865
899
  log=True,
@@ -868,7 +902,7 @@ class Engine:
868
902
 
869
903
  # Step 1: Upload file
870
904
  self._log(" Uploading to storage...")
871
- raw_result = await self._upload_file_direct(file_content, filename, mime_type)
905
+ raw_result = await self._upload_file_direct(file_source, filename, mime_type)
872
906
 
873
907
  if not raw_result.get("ok"):
874
908
  errors = raw_result.get("issues", {}).get("errors", [])
@@ -1139,7 +1173,6 @@ class Engine:
1139
1173
  current_step_message=data.get("current_step", {}).get("message")
1140
1174
  if data.get("current_step")
1141
1175
  else None,
1142
- estimated_seconds=data.get("estimated_seconds"),
1143
1176
  estimated_wait_seconds=data.get("estimated_wait_seconds"),
1144
1177
  error_message=data.get("error_message"),
1145
1178
  report_url=report_url,
@@ -205,7 +205,6 @@ class EngineResult:
205
205
  queue_position: Optional[int] = None
206
206
  current_step: Optional[str] = None
207
207
  current_step_message: Optional[str] = None
208
- estimated_seconds: Optional[int] = None
209
208
  estimated_wait_seconds: Optional[int] = None
210
209
  error_message: Optional[str] = None
211
210
 
@@ -232,6 +231,5 @@ class RunStatus:
232
231
  queue_position: Optional[int] = None
233
232
  current_step: Optional[str] = None
234
233
  current_step_message: Optional[str] = None
235
- estimated_seconds: Optional[int] = None
236
234
  estimated_wait_seconds: Optional[int] = None
237
235
  error_message: Optional[str] = None
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "discovery-engine-api"
3
- version = "0.2.98"
3
+ version = "0.2.100"
4
4
  description = "Python SDK for Disco API"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"