lightly-studio 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lightly-studio might be problematic. Click here for more details.

Files changed (137)
  1. lightly_studio/api/app.py +2 -0
  2. lightly_studio/api/features.py +3 -5
  3. lightly_studio/api/routes/api/caption.py +30 -0
  4. lightly_studio/api/routes/api/dataset_tag.py +10 -0
  5. lightly_studio/api/routes/api/embeddings2d.py +42 -39
  6. lightly_studio/api/routes/api/metadata.py +57 -1
  7. lightly_studio/core/add_samples.py +138 -0
  8. lightly_studio/core/dataset.py +232 -18
  9. lightly_studio/core/dataset_query/__init__.py +14 -0
  10. lightly_studio/core/sample.py +33 -1
  11. lightly_studio/dataset/loader.py +2 -8
  12. lightly_studio/db_manager.py +14 -6
  13. lightly_studio/dist_lightly_studio_view_app/_app/env.js +1 -1
  14. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/0.CN4hnTks.css +1 -0
  15. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/2.CkOblLn7.css +1 -0
  16. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/Samples.C0_eo9eP.css +1 -0
  17. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/{useFeatureFlags.CV-KWLNP.css → _layout.CefECEWA.css} +1 -1
  18. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_layout.kFFGI0zL.css +1 -0
  19. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/transform.sLzR40om.css +1 -0
  20. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{6t3IJ0vQ.js → BOmrKuMn.js} +1 -1
  21. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{Cs1XmhiF.js → BPpOWbDa.js} +1 -1
  22. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BaFFwDFr.js +1 -0
  23. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BiGQqqJP.js +1 -0
  24. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BrNKoXwc.js +20 -0
  25. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BsaJCCG_.js +96 -0
  26. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BtXGzlpP.js +20 -0
  27. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/C1FmrZbK.js +1 -0
  28. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/C3xJX0nD.js +1 -0
  29. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CANX9QXL.js +1 -0
  30. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CAPx0Bfm.js +1 -0
  31. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CP9M7pei.js +39 -0
  32. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CWuDkrMZ.js +436 -0
  33. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/ChlxSwqI.js +1 -0
  34. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Cj4nZbtb.js +1 -0
  35. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/ClzkJBWk.js +1 -0
  36. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CpbA3HU7.js +2 -0
  37. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D8ZGoCPm.js +3 -0
  38. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DMJzr1NB.js +1 -0
  39. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{BdfTHw61.js → DNJnBfHs.js} +1 -1
  40. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{keKYsoph.js → DUtlYNuP.js} +1 -1
  41. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DVxjPOJB.js +1 -0
  42. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DmGM9V9Q.js +1 -0
  43. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DoEId1MK.js +1 -0
  44. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DthpwYR_.js +2 -0
  45. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DyIcJj6J.js +1 -0
  46. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/SiegjVo0.js +1 -0
  47. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/{BfHVnyNT.js → WEyXQRi6.js} +1 -1
  48. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/gBp1tBnA.js +1 -0
  49. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/xQhUoIl9.js +1 -0
  50. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/app.Y-sSoz5q.js +2 -0
  51. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/start.CvxVp0Cu.js +1 -0
  52. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/0.0Fm6E-5B.js +4 -0
  53. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/1.DB-0vkHb.js +1 -0
  54. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/10.vaUePh5k.js +1 -0
  55. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/11.7i7ljNVT.js +1 -0
  56. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/13.9qy3WtZv.js +1 -0
  57. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{2.C8HLK8mj.js → 2.Drwwdm7A.js} +267 -111
  58. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{3.CLvg3QcJ.js → 3.D3X_-Wan.js} +1 -1
  59. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{4.BQhDtXUI.js → 4.C9TqY3tA.js} +1 -1
  60. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/5.iRw6HCWX.js +39 -0
  61. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{6.uBV1Lhat.js → 6.fqfYR7dB.js} +1 -1
  62. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/7.C7gMM-gk.js +1 -0
  63. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/8.C4v1w-oS.js +20 -0
  64. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/9.DbHcSiMn.js +1 -0
  65. lightly_studio/dist_lightly_studio_view_app/_app/version.json +1 -1
  66. lightly_studio/dist_lightly_studio_view_app/index.html +15 -14
  67. lightly_studio/examples/example.py +4 -0
  68. lightly_studio/examples/example_coco.py +4 -0
  69. lightly_studio/examples/example_coco_caption.py +24 -0
  70. lightly_studio/examples/example_metadata.py +4 -1
  71. lightly_studio/examples/example_selection.py +4 -0
  72. lightly_studio/examples/example_split_work.py +4 -0
  73. lightly_studio/examples/example_yolo.py +4 -0
  74. lightly_studio/export/export_dataset.py +11 -3
  75. lightly_studio/metadata/compute_typicality.py +1 -1
  76. lightly_studio/models/caption.py +74 -0
  77. lightly_studio/models/dataset.py +1 -2
  78. lightly_studio/models/metadata.py +1 -1
  79. lightly_studio/models/sample.py +9 -2
  80. lightly_studio/models/settings.py +5 -0
  81. lightly_studio/resolvers/caption_resolver.py +80 -0
  82. lightly_studio/resolvers/dataset_resolver.py +6 -11
  83. lightly_studio/resolvers/metadata_resolver/__init__.py +2 -2
  84. lightly_studio/resolvers/metadata_resolver/sample/__init__.py +3 -3
  85. lightly_studio/resolvers/metadata_resolver/sample/bulk_update_metadata.py +46 -0
  86. lightly_studio/resolvers/sample_resolver.py +1 -0
  87. lightly_studio/resolvers/samples_filter.py +18 -10
  88. lightly_studio/resolvers/settings_resolver.py +3 -0
  89. lightly_studio/resolvers/twodim_embedding_resolver.py +29 -0
  90. lightly_studio/selection/__init__.py +1 -0
  91. lightly_studio/selection/mundig.py +41 -0
  92. lightly_studio/type_definitions.py +2 -0
  93. lightly_studio-0.4.0.dist-info/METADATA +78 -0
  94. {lightly_studio-0.3.3.dist-info → lightly_studio-0.4.0.dist-info}/RECORD +96 -88
  95. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/0.CA_CXIBb.css +0 -1
  96. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/_layout.DS78jgNY.css +0 -1
  97. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/index.BVs_sZj9.css +0 -1
  98. lightly_studio/dist_lightly_studio_view_app/_app/immutable/assets/transform.D487hwJk.css +0 -1
  99. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/8NsknIT2.js +0 -1
  100. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BND_-4Kp.js +0 -1
  101. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BjkP1AHA.js +0 -1
  102. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BuuNVL9G.js +0 -1
  103. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/BzKGpnl4.js +0 -1
  104. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CCx7Ho51.js +0 -1
  105. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CH6P3X75.js +0 -1
  106. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CR2upx_Q.js +0 -4
  107. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CWPZrTTJ.js +0 -1
  108. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CwPowJfP.js +0 -1
  109. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/CxFKfZ9T.js +0 -1
  110. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/Cxevwdid.js +0 -1
  111. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D4whDBUi.js +0 -1
  112. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/D6r9vr07.js +0 -1
  113. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DA6bFLPR.js +0 -1
  114. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DEgUu98i.js +0 -3
  115. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DGTPl6Gk.js +0 -1
  116. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DKGxBSlK.js +0 -1
  117. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DQXoLcsF.js +0 -1
  118. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DQe_kdRt.js +0 -92
  119. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/DcY4jgG3.js +0 -1
  120. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/H7C68rOM.js +0 -1
  121. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/RmD8FzRo.js +0 -1
  122. lightly_studio/dist_lightly_studio_view_app/_app/immutable/chunks/V-MnMC1X.js +0 -1
  123. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/app.BVr6DYqP.js +0 -2
  124. lightly_studio/dist_lightly_studio_view_app/_app/immutable/entry/start.u7zsVvqp.js +0 -1
  125. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/0.Da2agmdd.js +0 -1
  126. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/1.B11tVRJV.js +0 -1
  127. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/10.l30Zud4h.js +0 -1
  128. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/12.CgKPGcAP.js +0 -1
  129. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/5.-6XqWX5G.js +0 -1
  130. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/7.BXsgoQZh.js +0 -1
  131. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/8.BkbcnUs8.js +0 -1
  132. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/9.Bkrv-Vww.js +0 -1
  133. lightly_studio/resolvers/metadata_resolver/sample/bulk_set_metadata.py +0 -48
  134. lightly_studio/selection/README.md +0 -6
  135. lightly_studio-0.3.3.dist-info/METADATA +0 -814
  136. lightly_studio/dist_lightly_studio_view_app/_app/immutable/nodes/{11.CWG1ehzT.js → 12.CWG1ehzT.js} +0 -0
  137. {lightly_studio-0.3.3.dist-info → lightly_studio-0.4.0.dist-info}/WHEEL +0 -0
lightly_studio/api/app.py CHANGED
@@ -16,6 +16,7 @@ from lightly_studio.api.routes import healthz, images, webapp
16
16
  from lightly_studio.api.routes.api import (
17
17
  annotation,
18
18
  annotation_label,
19
+ caption,
19
20
  classifier,
20
21
  dataset,
21
22
  dataset_tag,
@@ -89,6 +90,7 @@ api_router.include_router(export.export_router)
89
90
  api_router.include_router(sample.samples_router)
90
91
  api_router.include_router(annotation_label.annotations_label_router)
91
92
  api_router.include_router(annotation.annotations_router)
93
+ api_router.include_router(caption.captions_router)
92
94
  api_router.include_router(text_embedding.text_embedding_router)
93
95
  api_router.include_router(settings.settings_router)
94
96
  api_router.include_router(classifier.classifier_router)
lightly_studio/api/features.py CHANGED
@@ -1,7 +1,5 @@
1
- """Global list fo the active features."""
1
+ """Global list of the active features."""
2
2
 
3
- # TODO(Kondrat 04/25): Pass the feature flag to app
4
- # https://linear.app/lightly/issue/LIG-6708/introduce-apifeatures-endpoint
5
- from typing import List
3
+ from __future__ import annotations
6
4
 
7
- lightly_studio_active_features: List[str] = []
5
+ lightly_studio_active_features: list[str] = []
lightly_studio/api/routes/api/caption.py ADDED
@@ -0,0 +1,30 @@
1
+ """API routes for dataset captions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from uuid import UUID
6
+
7
+ from fastapi import APIRouter, Depends, Path
8
+ from typing_extensions import Annotated
9
+
10
+ from lightly_studio.api.routes.api.validators import Paginated, PaginatedWithCursor
11
+ from lightly_studio.db_manager import SessionDep
12
+ from lightly_studio.models.caption import CaptionsListView
13
+ from lightly_studio.resolvers import caption_resolver
14
+ from lightly_studio.resolvers.caption_resolver import GetAllCaptionsResult
15
+
16
+ captions_router = APIRouter(prefix="/datasets/{dataset_id}", tags=["captions"])
17
+
18
+
19
+ @captions_router.get("/captions", response_model=CaptionsListView)
20
+ def read_captions(
21
+ dataset_id: Annotated[UUID, Path(title="Dataset Id")],
22
+ session: SessionDep,
23
+ pagination: Annotated[PaginatedWithCursor, Depends()],
24
+ ) -> GetAllCaptionsResult:
25
+ """Retrieve captions for a dataset."""
26
+ return caption_resolver.get_all(
27
+ session=session,
28
+ dataset_id=dataset_id,
29
+ pagination=Paginated(offset=pagination.offset, limit=pagination.limit),
30
+ )
lightly_studio/api/routes/api/dataset_tag.py CHANGED
@@ -164,6 +164,11 @@ class SampleIdsBody(BaseModel):
164
164
  )
165
165
  def add_sample_ids_to_tag_id(
166
166
  session: SessionDep,
167
+ # dataset_id is needed for the generator
168
+ dataset_id: Annotated[ # noqa: ARG001
169
+ UUID,
170
+ Path(title="Dataset Id", description="The ID of the dataset"),
171
+ ],
167
172
  tag_id: UUID,
168
173
  body: SampleIdsBody,
169
174
  ) -> bool:
@@ -215,6 +220,11 @@ class AnnotationIdsBody(BaseModel):
215
220
  )
216
221
  def add_annotation_ids_to_tag_id(
217
222
  session: SessionDep,
223
+ # dataset_id is needed for the generator
224
+ dataset_id: Annotated[ # noqa: ARG001
225
+ UUID,
226
+ Path(title="Dataset Id", description="The ID of the dataset"),
227
+ ],
218
228
  tag_id: UUID,
219
229
  body: AnnotationIdsBody,
220
230
  ) -> bool:
lightly_studio/api/routes/api/embeddings2d.py CHANGED
@@ -3,25 +3,38 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import io
6
+ from uuid import UUID
6
7
 
7
- import numpy as np
8
8
  import pyarrow as pa
9
9
  from fastapi import APIRouter, HTTPException, Response
10
- from numpy.typing import NDArray
11
10
  from pyarrow import ipc
12
- from sklearn.manifold import TSNE
11
+ from pydantic import BaseModel, Field
13
12
  from sqlmodel import select
14
13
 
15
14
  from lightly_studio.db_manager import SessionDep
16
15
  from lightly_studio.models.dataset import DatasetTable
17
16
  from lightly_studio.models.embedding_model import EmbeddingModelTable
18
- from lightly_studio.resolvers import sample_embedding_resolver
17
+ from lightly_studio.resolvers import sample_embedding_resolver, sample_resolver
18
+ from lightly_studio.resolvers.samples_filter import SampleFilter
19
+ from lightly_studio.resolvers.twodim_embedding_resolver import _calculate_2d_embeddings
19
20
 
20
21
  embeddings2d_router = APIRouter()
21
22
 
22
23
 
23
- @embeddings2d_router.get("/embeddings2d/tsne")
24
- def get_embeddings2d__tsne(session: SessionDep) -> Response:
24
+ class GetEmbeddings2DRequest(BaseModel):
25
+ """Request body for retrieving 2D embeddings."""
26
+
27
+ filters: SampleFilter | None = Field(
28
+ None,
29
+ description="Filter parameters identifying matching samples",
30
+ )
31
+
32
+
33
+ @embeddings2d_router.post("/embeddings2d/default")
34
+ def get_2d_embeddings(
35
+ session: SessionDep,
36
+ body: GetEmbeddings2DRequest | None = None,
37
+ ) -> Response:
25
38
  """Return 2D embeddings serialized as an Arrow stream."""
26
39
  # TODO(Malte, 09/2025): Support choosing the dataset via API parameter.
27
40
  dataset = session.exec(select(DatasetTable).limit(1)).first()
@@ -37,17 +50,33 @@ def get_embeddings2d__tsne(session: SessionDep) -> Response:
37
50
  if embedding_model is None:
38
51
  raise HTTPException(status_code=404, detail="No embedding model configured.")
39
52
 
40
- # TODO(Malte, 09/2025): Support choosing a subset of samples via API parameter.
41
53
  embeddings = sample_embedding_resolver.get_all_by_dataset_id(
42
54
  session=session,
43
55
  dataset_id=dataset.dataset_id,
44
56
  embedding_model_id=embedding_model.embedding_model_id,
45
57
  )
46
58
 
47
- embedding_values = np.asarray([e.embedding for e in embeddings], dtype=np.float32)
48
- embedding_values_tsne = _calculate_tsne_embeddings(embedding_values)
49
- x = embedding_values_tsne[:, 0]
50
- y = embedding_values_tsne[:, 1]
59
+ planar_embeddings = _calculate_2d_embeddings([e.embedding for e in embeddings])
60
+ # TODO(lukas 10/2025): This can become very slow for large datasets, we should consider using
61
+ # `pa.chunked_array` to directly construct the xy array.
62
+ x = (x for (x, _) in planar_embeddings)
63
+ y = (y for (_, y) in planar_embeddings)
64
+
65
+ matching_sample_ids: set[UUID] | None = None
66
+ filters = body.filters if body else None
67
+ if filters:
68
+ matching_samples_result = sample_resolver.get_all_by_dataset_id(
69
+ session=session,
70
+ dataset_id=dataset.dataset_id,
71
+ filters=filters,
72
+ )
73
+ matching_sample_ids = {sample.sample_id for sample in matching_samples_result.samples}
74
+
75
+ sample_ids = [embedding.sample_id for embedding in embeddings]
76
+ if matching_sample_ids is None:
77
+ fulfils_filter = [1] * len(sample_ids)
78
+ else:
79
+ fulfils_filter = [1 if sample_id in matching_sample_ids else 0 for sample_id in sample_ids]
51
80
 
52
81
  # TODO(Malte, 09/2025): Save the 2D-embeddings in the database to avoid recomputing
53
82
  # them on every request.
@@ -57,6 +86,8 @@ def get_embeddings2d__tsne(session: SessionDep) -> Response:
57
86
  {
58
87
  "x": pa.array(x, type=pa.float32()),
59
88
  "y": pa.array(y, type=pa.float32()),
89
+ "fulfils_filter": pa.array(fulfils_filter, type=pa.uint8()),
90
+ "sample_id": pa.array([str(sample_id) for sample_id in sample_ids], type=pa.string()),
60
91
  }
61
92
  )
62
93
 
@@ -74,31 +105,3 @@ def get_embeddings2d__tsne(session: SessionDep) -> Response:
74
105
  "X-Content-Type-Options": "nosniff",
75
106
  },
76
107
  )
77
-
78
-
79
- def _calculate_tsne_embeddings(embedding_values: NDArray[np.float32]) -> NDArray[np.float32]:
80
- # TODO(Malte, 10/2025): Switch to a better and faster projection method than
81
- # scikit-learn's TSNE.
82
- # See https://linear.app/lightly/issue/LIG-7678/embedding-plot-investigate-fasterandbetter-2d-computation-options
83
- n_samples = embedding_values.shape[0]
84
- # For 0, 1 or 2 samples we hard-code deterministic coordinates.
85
- if n_samples == 0:
86
- return np.zeros((0, 2), dtype=np.float32)
87
- if n_samples == 1:
88
- return np.asarray([[0.0, 0.0]], dtype=np.float32)
89
- if n_samples == 2: # noqa: PLR2004
90
- return np.asarray([[0.0, 0.0], [1.0, 1.0]], dtype=np.float32)
91
-
92
- # Copied from lightly-core:
93
- # https://github.com/lightly-ai/lightly-core/blob/b738952516e916eba42fdd28498491ff18df5c1e/appv2/packages/queueworker/src/jobs/embeddings2d/function-source/main.py#L179-L186
94
- embeddings_2d: NDArray[np.float32] = TSNE(
95
- init="pca", # changed in https://github.com/scikit-learn/scikit-learn/issues/18018
96
- learning_rate="auto", # changed in https://github.com/scikit-learn/scikit-learn/issues/18018
97
- n_components=2,
98
- # Perplexity must be _less_ than the number of entries. 30 is the default value.
99
- # https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
100
- perplexity=min(30.0, float(n_samples - 1)),
101
- # Make the computation deterministic.
102
- random_state=0,
103
- ).fit_transform(embedding_values)
104
- return embeddings_2d
lightly_studio/api/routes/api/metadata.py CHANGED
@@ -5,11 +5,16 @@ from __future__ import annotations
5
5
  from typing import List
6
6
  from uuid import UUID
7
7
 
8
- from fastapi import APIRouter, Path
8
+ from fastapi import APIRouter, Depends, Path
9
+ from pydantic import BaseModel, Field
9
10
  from typing_extensions import Annotated
10
11
 
12
+ from lightly_studio.api.routes.api.dataset import get_and_validate_dataset_id
11
13
  from lightly_studio.db_manager import SessionDep
14
+ from lightly_studio.metadata import compute_typicality
15
+ from lightly_studio.models.dataset import DatasetTable
12
16
  from lightly_studio.models.metadata import MetadataInfoView
17
+ from lightly_studio.resolvers import embedding_model_resolver
13
18
  from lightly_studio.resolvers.metadata_resolver.sample.get_metadata_info import (
14
19
  get_all_metadata_keys_and_schema,
15
20
  )
@@ -33,3 +38,54 @@ def get_metadata_info(
33
38
  for numerical metadata types.
34
39
  """
35
40
  return get_all_metadata_keys_and_schema(session=session, dataset_id=dataset_id)
41
+
42
+
43
+ class ComputeTypicalityRequest(BaseModel):
44
+ """Request model for computing typicality metadata."""
45
+
46
+ embedding_model_name: str | None = Field(
47
+ default=None,
48
+ description="Embedding model name (uses default if not specified)",
49
+ )
50
+ metadata_name: str = Field(
51
+ default="typicality",
52
+ description="Metadata field name (defaults to 'typicality')",
53
+ )
54
+
55
+
56
+ @metadata_router.post(
57
+ "/metadata/typicality",
58
+ status_code=204,
59
+ response_model=None,
60
+ )
61
+ def compute_typicality_metadata(
62
+ session: SessionDep,
63
+ dataset: Annotated[
64
+ DatasetTable,
65
+ Depends(get_and_validate_dataset_id),
66
+ ],
67
+ request: ComputeTypicalityRequest,
68
+ ) -> None:
69
+ """Compute typicality metadata for a dataset.
70
+
71
+ Args:
72
+ session: The database session.
73
+ dataset: The dataset to compute typicality for.
74
+ request: Request parameters including optional embedding model name
75
+ and metadata field name.
76
+
77
+ Returns:
78
+ None (204 No Content on success).
79
+ """
80
+ embedding_model = embedding_model_resolver.get_by_name(
81
+ session=session,
82
+ dataset_id=dataset.dataset_id,
83
+ embedding_model_name=request.embedding_model_name,
84
+ )
85
+
86
+ compute_typicality.compute_typicality_metadata(
87
+ session=session,
88
+ dataset_id=dataset.dataset_id,
89
+ embedding_model_id=embedding_model.embedding_model_id,
90
+ metadata_name=request.metadata_name,
91
+ )
lightly_studio/core/add_samples.py CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import json
6
+ from collections import defaultdict
5
7
  from dataclasses import dataclass, field
6
8
  from pathlib import Path
7
9
  from typing import Iterable
@@ -26,10 +28,12 @@ from tqdm import tqdm
26
28
 
27
29
  from lightly_studio.models.annotation.annotation_base import AnnotationCreate
28
30
  from lightly_studio.models.annotation_label import AnnotationLabelCreate
31
+ from lightly_studio.models.caption import CaptionCreate
29
32
  from lightly_studio.models.sample import SampleCreate, SampleTable
30
33
  from lightly_studio.resolvers import (
31
34
  annotation_label_resolver,
32
35
  annotation_resolver,
36
+ caption_resolver,
33
37
  sample_resolver,
34
38
  )
35
39
 
@@ -218,6 +222,111 @@ def load_into_dataset_from_labelformat(
218
222
  return created_sample_ids
219
223
 
220
224
 
225
+ def load_into_dataset_from_coco_captions(
226
+ session: Session,
227
+ dataset_id: UUID,
228
+ annotations_json: Path,
229
+ images_path: Path,
230
+ ) -> list[UUID]:
231
+ """Load samples and captions from a COCO captions file into the dataset.
232
+
233
+ Args:
234
+ session: Database session used for resolver operations.
235
+ dataset_id: Identifier of the dataset that receives the samples.
236
+ annotations_json: Path to the COCO captions annotations file.
237
+ images_path: Directory containing the referenced images.
238
+
239
+ Returns:
240
+ The list of newly created sample identifiers.
241
+ """
242
+ with fsspec.open(str(annotations_json), "r") as file:
243
+ coco_payload = json.load(file)
244
+
245
+ images: list[dict[str, object]] = coco_payload.get("images", [])
246
+ annotations: list[dict[str, object]] = coco_payload.get("annotations", [])
247
+
248
+ captions_by_image_id: dict[int, list[str]] = defaultdict(list)
249
+ for annotation in annotations:
250
+ image_id = annotation["image_id"]
251
+ caption = annotation["caption"]
252
+ if not isinstance(image_id, int):
253
+ continue
254
+ if not isinstance(caption, str):
255
+ continue
256
+ caption_text = caption.strip()
257
+ if not caption_text:
258
+ continue
259
+ captions_by_image_id[image_id].append(caption_text)
260
+
261
+ logging_context = _LoadingLoggingContext(
262
+ n_samples_to_be_inserted=len(images),
263
+ n_samples_before_loading=sample_resolver.count_by_dataset_id(
264
+ session=session, dataset_id=dataset_id
265
+ ),
266
+ )
267
+
268
+ captions_to_create: list[CaptionCreate] = []
269
+ samples_to_create: list[SampleCreate] = []
270
+ created_sample_ids: list[UUID] = []
271
+ image_path_to_captions: dict[str, list[str]] = {}
272
+
273
+ for image_info in tqdm(images, desc="Processing images", unit=" images"):
274
+ if isinstance(image_info["id"], int):
275
+ image_id_raw = image_info["id"]
276
+ else:
277
+ continue
278
+ file_name_raw = str(image_info["file_name"])
279
+
280
+ width = image_info["width"] if isinstance(image_info["width"], int) else 0
281
+ height = image_info["height"] if isinstance(image_info["height"], int) else 0
282
+ sample = SampleCreate(
283
+ file_name=file_name_raw,
284
+ file_path_abs=str(images_path / file_name_raw),
285
+ width=width,
286
+ height=height,
287
+ dataset_id=dataset_id,
288
+ )
289
+ samples_to_create.append(sample)
290
+ image_path_to_captions[sample.file_path_abs] = captions_by_image_id.get(image_id_raw, [])
291
+
292
+ if len(samples_to_create) >= SAMPLE_BATCH_SIZE:
293
+ created_samples_batch, paths_not_inserted = _create_batch_samples(
294
+ session=session, samples=samples_to_create
295
+ )
296
+ created_sample_ids.extend(s.sample_id for s in created_samples_batch)
297
+ logging_context.update_example_paths(paths_not_inserted)
298
+ _process_batch_captions(
299
+ session=session,
300
+ dataset_id=dataset_id,
301
+ stored_samples=created_samples_batch,
302
+ image_path_to_captions=image_path_to_captions,
303
+ captions_to_create=captions_to_create,
304
+ )
305
+ samples_to_create.clear()
306
+ image_path_to_captions.clear()
307
+
308
+ if samples_to_create:
309
+ created_samples_batch, paths_not_inserted = _create_batch_samples(
310
+ session=session, samples=samples_to_create
311
+ )
312
+ created_sample_ids.extend(s.sample_id for s in created_samples_batch)
313
+ logging_context.update_example_paths(paths_not_inserted)
314
+ _process_batch_captions(
315
+ session=session,
316
+ dataset_id=dataset_id,
317
+ stored_samples=created_samples_batch,
318
+ image_path_to_captions=image_path_to_captions,
319
+ captions_to_create=captions_to_create,
320
+ )
321
+
322
+ if captions_to_create:
323
+ caption_resolver.create_many(session=session, captions=captions_to_create)
324
+
325
+ _log_loading_results(session=session, dataset_id=dataset_id, logging_context=logging_context)
326
+
327
+ return created_sample_ids
328
+
329
+
221
330
  def _log_loading_results(
222
331
  session: Session, dataset_id: UUID, logging_context: _LoadingLoggingContext
223
332
  ) -> None:
@@ -372,3 +481,32 @@ def _process_batch_annotations( # noqa: PLR0913
372
481
  if len(annotations_to_create) >= ANNOTATION_BATCH_SIZE:
373
482
  annotation_resolver.create_many(session=session, annotations=annotations_to_create)
374
483
  annotations_to_create.clear()
484
+
485
+
486
+ def _process_batch_captions(
487
+ session: Session,
488
+ dataset_id: UUID,
489
+ stored_samples: list[SampleTable],
490
+ image_path_to_captions: dict[str, list[str]],
491
+ captions_to_create: list[CaptionCreate],
492
+ ) -> None:
493
+ """Process captions for a batch of samples."""
494
+ if not stored_samples:
495
+ return
496
+
497
+ for stored_sample in stored_samples:
498
+ captions = image_path_to_captions[stored_sample.file_path_abs]
499
+ if not captions:
500
+ continue
501
+
502
+ for caption_text in captions:
503
+ caption = CaptionCreate(
504
+ dataset_id=dataset_id,
505
+ sample_id=stored_sample.sample_id,
506
+ text=caption_text,
507
+ )
508
+ captions_to_create.append(caption)
509
+
510
+ if len(captions_to_create) >= ANNOTATION_BATCH_SIZE:
511
+ caption_resolver.create_many(session=session, captions=captions_to_create)
512
+ captions_to_create.clear()