neuromeka-vfm 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,122 @@
1
+ """
2
+ Utility to generate simple parametric meshes (currently rectangular box) as binary STL.
3
+
4
+ Design goals
5
+ - Units: meters
6
+ - Origin: object center at (0, 0, 0)
7
+ - Axes: faces aligned to +/-X, +/-Y, +/-Z
8
+ - Output: binary STL saved to /opt/meshes (docker volume mount)
9
+
10
+ Usage (programmatic):
11
+ from neuromeka_vfm.generate_mesh import write_box_stl
12
+ path = write_box_stl("custom_box.stl", width=0.054, depth=0.097, height=0.054)
13
+
14
+ CLI (optional):
15
+ python -m neuromeka_vfm.generate_mesh box custom_box.stl 0.054 0.097 0.054
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import struct
21
+ import sys
22
+ from pathlib import Path
23
+ from typing import Iterable, Tuple
24
+
25
+ MESH_DIR = Path("/opt/meshes")
26
+
27
+
28
def _ensure_dir(path: Path) -> None:
    """Create the directory that will contain ``path``.

    Missing ancestors are created as well, and an already existing
    directory is not an error.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
30
+
31
+
32
def _pack_triangle(normal: Iterable[float], v1: Iterable[float], v2: Iterable[float], v3: Iterable[float]) -> bytes:
    """Serialize one facet as a binary STL record.

    The record is little-endian: twelve float32 values (the normal followed
    by the three vertices, in that order) and a trailing uint16 attribute
    byte count, which is always written as 0.
    """
    components = [*normal, *v1, *v2, *v3]
    attribute_byte_count = 0
    return struct.pack("<12fH", *components, attribute_byte_count)
42
+
43
+
44
def _box_triangles(width: float, depth: float, height: float):
    """Yield the 12 triangles of an axis-aligned box centered at the origin.

    Args:
        width: box extent along X.
        depth: box extent along Y.
        height: box extent along Z.

    Yields:
        ``(normal, v1, v2, v3)`` tuples, two per face. Vertices are ordered
        counter-clockwise when viewed from outside the box, so the
        right-hand-rule normal of every triangle agrees with ``normal``.

    Raises:
        ValueError: if a face definition does not list exactly 4 corners.
    """
    hx, hy, hz = width / 2.0, depth / 2.0, height / 2.0
    # 8 corners: indices 0-3 are the bottom (-Z) ring, 4-7 the top (+Z) ring.
    corners = [
        (-hx, -hy, -hz),
        (hx, -hy, -hz),
        (hx, hy, -hz),
        (-hx, hy, -hz),
        (-hx, -hy, hz),
        (hx, -hy, hz),
        (hx, hy, hz),
        (-hx, hy, hz),
    ]
    # Each face: outward normal + its 4 corners in CCW order seen from outside.
    quads = [
        ((-1, 0, 0), (0, 4, 7, 3)),  # -X
        ((1, 0, 0), (1, 2, 6, 5)),  # +X
        ((0, -1, 0), (0, 1, 5, 4)),  # -Y
        ((0, 1, 0), (3, 7, 6, 2)),  # +Y
        ((0, 0, -1), (0, 3, 2, 1)),  # -Z
        ((0, 0, 1), (4, 5, 6, 7)),  # +Z
    ]
    for normal, idx in quads:
        if len(idx) != 4:
            raise ValueError("Face index must have 4 vertices.")
        a, b, c, d = idx
        # Split the quad along the a-c diagonal: (a, b, c) and (a, c, d).
        yield normal, corners[a], corners[b], corners[c]
        yield normal, corners[a], corners[c], corners[d]
75
+
76
+
77
def write_box_stl(filename: str, width: float, depth: float, height: float) -> Path:
    """
    Create a rectangular box and save it as a binary STL file.

    Args:
        filename: output file name. A relative name is resolved against
            MESH_DIR; an absolute path is used as given. Missing parent
            directories are created.
        width, depth, height: box dimensions in meters along X, Y and Z.
            Each must be a positive, finite number.

    Returns:
        Path to the written STL file.

    Raises:
        ValueError: if any dimension is zero, negative, NaN or infinite.
    """
    # The chained comparison is False for NaN as well as for zero, negative
    # and infinite values, so a single test rejects every unusable dimension.
    if not all(0 < dim < float("inf") for dim in (width, depth, height)):
        raise ValueError("width, depth, height must be positive finite numbers.")

    out_path = Path(filename)
    if not out_path.is_absolute():
        out_path = MESH_DIR / out_path
    _ensure_dir(out_path)

    records = [_pack_triangle(*tri) for tri in _box_triangles(width, depth, height)]
    # Binary STL layout: 80-byte header, uint32 triangle count, facet records.
    header = b"rect_box_stl".ljust(80, b"\0")
    with out_path.open("wb") as f:
        f.write(header)
        f.write(struct.pack("<I", len(records)))
        f.write(b"".join(records))
    return out_path
105
+
106
+
107
def _cli(args: list[str]) -> int:
    """Run the command-line interface and return a process exit code.

    Args:
        args: command-line arguments without the program name; expected
            form is ``["box", <filename>, <width>, <depth>, <height>]``
            (the sub-command is matched case-insensitively).

    Returns:
        0 when the STL was written, 1 on a usage error or when mesh
        generation fails (the error is printed, not raised).
    """
    if len(args) != 5 or args[0].lower() != "box":
        # The module ships inside the neuromeka_vfm package, so that is the
        # path users must pass to ``python -m``.
        print("Usage: python -m neuromeka_vfm.generate_mesh box <filename> <width> <depth> <height>")
        return 1
    _, fname, w, d, h = args
    try:
        path = write_box_stl(fname, float(w), float(d), float(h))
    except Exception as exc:  # noqa: BLE001 - CLI boundary: report and exit non-zero
        print(f"Error: {exc}")
        return 1
    print(f"STL written to: {path}")
    return 0
119
+
120
+
121
+ if __name__ == "__main__":
122
+ sys.exit(_cli(sys.argv[1:]))
@@ -0,0 +1,377 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import logging
10
+ from typing import Tuple, Dict
11
+
12
+ import numpy as np
13
+ import trimesh
14
+ import trimesh.transformations as tra
15
+ from tqdm import tqdm
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def _pairwise_distances(X: np.ndarray, Y: np.ndarray, norm: int) -> np.ndarray:
    """Return the matrix of distances between every row of X and every row of Y.

    Entry ``[i, j]`` is the ``norm``-order vector norm of ``X[i] - Y[j]``.
    Orders 1 and 2 are computed directly; any other order is delegated to
    ``np.linalg.norm``.
    """
    # Broadcast to all pairs: delta[i, j, :] = X[i, :] - Y[j, :].
    delta = X[:, None, :] - Y[None, :, :]
    if norm == 1:
        return np.abs(delta).sum(axis=2)
    if norm == 2:
        return np.sqrt((delta * delta).sum(axis=2))
    return np.linalg.norm(delta, ord=norm, axis=2)
28
+
29
+
30
def knn_points(X: np.ndarray, K: int, norm: int):
    """
    Find the K nearest neighbors of every point within the same point cloud.

    A point is never reported as its own neighbor: its self-distance is set
    to infinity before selection. Rows are processed in chunks so that one
    float32 block of the distance matrix stays within a 64 MiB budget.

    Args:
        X: (N, 3) array representing the point cloud (converted to float32).
        K: Number of nearest neighbors; must satisfy 0 < K < N.
        norm: Order of the norm handed to ``_pairwise_distances``.

    Returns:
        dists: (N, K) float32 array of neighbor distances, ascending per row.
        idxs: (N, K) int64 array with the matching neighbor indices into X.

    Raises:
        ValueError: if X is not (N, 3), K is not positive, or K >= N.
    """
    X = np.asarray(X, dtype=np.float32)
    if X.ndim != 2 or X.shape[1] != 3:
        raise ValueError("X must be a (N, 3) array")
    if K <= 0:
        raise ValueError("K must be positive")
    N = X.shape[0]
    if K >= N:
        raise ValueError("K must be smaller than number of points")

    dists_out = np.empty((N, K), dtype=np.float32)
    idxs_out = np.empty((N, K), dtype=np.int64)

    # Number of query rows per chunk, derived from the size of one row of
    # the (rows, N) distance matrix.
    budget_bytes = 64 * 1024 * 1024
    row_bytes = max(N * X.dtype.itemsize, 1)
    rows_per_chunk = max(1, min(N, budget_bytes // row_bytes))

    start = 0
    while start < N:
        stop = min(start + rows_per_chunk, N)
        block_dists = _pairwise_distances(X[start:stop], X, norm=norm)

        # Row r of this chunk is point (start + r); mask its self-distance.
        local_rows = np.arange(stop - start)
        block_dists[local_rows, local_rows + start] = np.inf

        # argpartition gathers the K smallest per row (unordered); a small
        # argsort over just those K columns then puts them in ascending order.
        nearest = np.argpartition(block_dists, K, axis=1)[:, :K]
        nearest_dists = np.take_along_axis(block_dists, nearest, axis=1)
        rank = np.argsort(nearest_dists, axis=1)

        idxs_out[start:stop] = np.take_along_axis(nearest, rank, axis=1)
        dists_out[start:stop] = np.take_along_axis(nearest_dists, rank, axis=1)
        start = stop

    return dists_out, idxs_out
77
+
78
+
79
def point_cloud_outlier_removal(
    obj_pc: np.ndarray, threshold: float = 0.014, K: int = 20
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Split a point cloud into inliers and outliers by neighborhood density.

    For every point the mean L1 distance to its K nearest neighbors is
    computed with ``knn_points``. A point is kept only when that mean is
    strictly below ``threshold``; all other points are reported as removed.

    Args:
        obj_pc (np.ndarray): (N, 3) array representing the point cloud.
        threshold (float): Upper bound (exclusive) on the mean distance to
            the K nearest neighbors for a point to be kept.
        K (int): Number of nearest neighbors to average over.

    Returns:
        Tuple[np.ndarray, np.ndarray]: (filtered points, removed points), both float32.

    Raises:
        ValueError: if obj_pc is not a (N, 3) array.
    """
    points = np.asarray(obj_pc, dtype=np.float32)
    if points.ndim != 2 or points.shape[1] != 3:
        raise ValueError("obj_pc must be a (N, 3) array")

    neighbor_dists, _ = knn_points(points, K=K, norm=1)
    keep = neighbor_dists.mean(axis=1) < threshold

    inliers, outliers = points[keep], points[~keep]
    logger.info(
        "Removed %s points from point cloud",
        points.shape[0] - inliers.shape[0],
    )
    return inliers, outliers
111
+
112
+
113
def point_cloud_outlier_removal_with_color(
    obj_pc: np.ndarray,
    obj_pc_color: np.ndarray,
    threshold: float = 0.014,
    K: int = 20,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Split a colored point cloud into inliers and outliers by neighborhood density.

    The same keep/remove decision is applied to points and colors: a point
    is kept only when the mean L1 distance to its K nearest neighbors
    (from ``knn_points``) is strictly below ``threshold``.

    Args:
        obj_pc (np.ndarray): (N, 3) array representing the point cloud.
        obj_pc_color (np.ndarray): per-point colors with the same shape as obj_pc.
        threshold (float): Upper bound (exclusive) on the mean distance to
            the K nearest neighbors for a point to be kept.
        K (int): Number of nearest neighbors to average over.

    Returns:
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: filtered points,
        removed points, filtered colors, removed colors (all float32).

    Raises:
        ValueError: if obj_pc is not (N, 3) or the color array shape differs.
    """
    points = np.asarray(obj_pc, dtype=np.float32)
    colors = np.asarray(obj_pc_color, dtype=np.float32)
    if points.ndim != 2 or points.shape[1] != 3:
        raise ValueError("obj_pc must be a (N, 3) array")
    if colors.shape != points.shape:
        raise ValueError("obj_pc_color must match obj_pc shape")

    neighbor_dists, _ = knn_points(points, K=K, norm=1)
    keep = neighbor_dists.mean(axis=1) < threshold
    drop = ~keep

    inliers, outliers = points[keep], points[drop]
    inlier_colors, outlier_colors = colors[keep], colors[drop]

    logger.info(
        "Removed %s points from point cloud",
        points.shape[0] - inliers.shape[0],
    )
    return inliers, outliers, inlier_colors, outlier_colors
154
+
155
+
156
def depth2points(
    depth: np.ndarray,
    fx: float,
    fy: float,
    cx: float,
    cy: float,
    xmap: np.ndarray = None,
    ymap: np.ndarray = None,
    rgb: np.ndarray = None,
    seg: np.ndarray = None,
    mask: np.ndarray = None,
) -> Dict:
    """Back-project a depth image into a point cloud with a pinhole model.

    Every pixel produces one row of the outputs, in flattened image order;
    nothing is filtered here. ``index`` tells the caller which rows are valid.

    Args:
        depth: HxW depth image; used directly as the Z coordinate.
        fx, fy: focal lengths in pixels.
        cx, cy: principal point in pixels.
        xmap, ymap: optional HxW pixel-coordinate grids. Whichever one is
            omitted defaults to the regular column (x) / row (y) index grid.
        rgb: optional HxWx3 color image aligned with ``depth``.
        seg: optional per-pixel label image aligned with ``depth``.
        mask: optional array; only pixels equal to 1 are considered valid.

    Returns:
        Dict with keys:
            "xyz": (H*W, 3) float32 points.
            "rgb": (H*W, 3) colors, or None when ``rgb`` is None.
            "seg": (H*W, 1) labels, or None when ``seg`` is None.
            "index": flat indices of pixels with depth > 0 (and mask == 1
                when ``mask`` is given).

    Raises:
        AssertionError: if ``rgb``, ``xmap`` or ``ymap`` does not match the
            height and width of ``depth``.
    """
    if rgb is not None:
        assert rgb.shape[0] == depth.shape[0] and rgb.shape[1] == depth.shape[1]
    if xmap is not None:
        assert xmap.shape[0] == depth.shape[0] and xmap.shape[1] == depth.shape[1]
    if ymap is not None:
        assert ymap.shape[0] == depth.shape[0] and ymap.shape[1] == depth.shape[1]

    im_height, im_width = depth.shape[0], depth.shape[1]

    if xmap is None or ymap is None:
        ww = np.linspace(0, im_width - 1, im_width)
        hh = np.linspace(0, im_height - 1, im_height)
        default_xmap, default_ymap = np.meshgrid(ww, hh)
        # Fill in only the missing grid so a caller-supplied one is honored.
        if xmap is None:
            xmap = default_xmap
        if ymap is None:
            ymap = default_ymap

    # Pinhole back-projection: X = (u - cx) * Z / fx, Y = (v - cy) * Z / fy.
    pt2 = depth
    pt0 = (xmap - cx) * pt2 / fx
    pt1 = (ymap - cy) * pt2 / fy

    # Valid pixels have strictly positive depth (NaN compares False).
    mask_depth = np.ma.getmaskarray(np.ma.masked_greater(pt2, 0))
    if mask is None:
        mask = mask_depth
    else:
        mask_semantic = np.ma.getmaskarray(np.ma.masked_equal(mask, 1))
        mask = mask_depth & mask_semantic

    index = mask.flatten().nonzero()[0]

    pc_xyz = np.concatenate(
        [
            channel.flatten()[:, np.newaxis].astype(np.float32)
            for channel in (pt0, pt1, pt2)
        ],
        axis=1,
    )

    pc_rgb = None
    if rgb is not None:
        pc_rgb = np.concatenate(
            [rgb[:, :, c].flatten()[:, np.newaxis] for c in range(3)], axis=1
        )

    pc_seg = seg.flatten()[:, np.newaxis] if seg is not None else None

    return {"xyz": pc_xyz, "rgb": pc_rgb, "seg": pc_seg, "index": index}
214
+
215
+
216
def depth_and_segmentation_to_point_clouds(
    depth_image: np.ndarray,
    segmentation_mask: np.ndarray,
    fx: float,
    fy: float,
    cx: float,
    cy: float,
    rgb_image: np.ndarray = None,
    target_object_id: int = 1,
    remove_object_from_scene: bool = False,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Convert depth image and instance segmentation mask to scene and object point clouds.

    Only pixels that ``depth2points`` reports as valid (depth > 0) are used.

    Args:
        depth_image: HxW depth image in meters
        segmentation_mask: HxW instance segmentation mask with integer labels
            (0 is treated as background)
        fx, fy, cx, cy: Camera intrinsic parameters (used at full floating-point
            precision)
        rgb_image: HxWx3 RGB image (optional, for colored point clouds)
        target_object_id: ID of the target object in the segmentation mask
        remove_object_from_scene: If True, removes object points from scene point cloud

    Returns:
        scene_pc: Nx3 point cloud of the entire scene (excluding object if remove_object_from_scene=True)
        object_pc: Mx3 point cloud of the target object only
        scene_colors: Nx3 RGB colors for scene points (or None)
        object_colors: Mx3 RGB colors for object points (or None)

    Raises:
        ValueError: If the target ID is absent from the mask, more than one
            non-background ID is present, or the target has no valid-depth points
    """
    unique_ids = np.unique(segmentation_mask)
    if target_object_id not in unique_ids:
        raise ValueError(
            f"Target object ID {target_object_id} not found in segmentation mask. Available IDs: {unique_ids}"
        )

    non_background_ids = unique_ids[unique_ids != 0]
    if len(non_background_ids) > 1:
        raise ValueError(
            "Multiple objects detected in segmentation mask: "
            f"{non_background_ids}. Please ensure only one object is present."
        )

    # Intrinsics are passed as floats: truncating them to int would shift the
    # principal point and focal length by up to a pixel.
    pts_data = depth2points(
        depth=depth_image,
        fx=float(fx),
        fy=float(fy),
        cx=float(cx),
        cy=float(cy),
        rgb=rgb_image,
        seg=segmentation_mask,
    )

    xyz = pts_data["xyz"]
    rgb = pts_data["rgb"]
    seg = pts_data["seg"]
    index = pts_data["index"]

    if seg is None:
        raise ValueError("Segmentation data not available from depth2points")

    # Keep only the pixels flagged as valid.
    xyz_valid = xyz[index]
    seg_valid = seg[index]
    rgb_valid = rgb[index] if rgb is not None else None

    object_mask = seg_valid.flatten() == target_object_id
    object_pc = xyz_valid[object_mask]
    object_colors = rgb_valid[object_mask] if rgb_valid is not None else None

    scene_pc = xyz_valid
    scene_colors = rgb_valid
    if remove_object_from_scene:
        scene_mask = ~object_mask
        scene_pc = xyz_valid[scene_mask]
        scene_colors = rgb_valid[scene_mask] if rgb_valid is not None else None
        logger.info(
            "Removed %s object points from scene point cloud",
            np.sum(object_mask),
        )

    if len(object_pc) == 0:
        raise ValueError(f"No points found for target object ID {target_object_id}")

    logger.info("Scene point cloud: %s points", len(scene_pc))
    logger.info("Object point cloud: %s points", len(object_pc))

    return scene_pc, object_pc, scene_colors, object_colors
305
+
306
+
307
def filter_colliding_grasps(
    scene_pc: np.ndarray,
    grasp_poses: np.ndarray,
    gripper_collision_mesh: trimesh.Trimesh,
    collision_threshold: float = 0.002,
    num_collision_samples: int = 2000,
) -> np.ndarray:
    """
    Filter grasps based on collision detection with scene point cloud.

    Points are sampled once on the gripper mesh surface, transformed by each
    grasp pose, and compared against the scene. A grasp collides when any
    sampled gripper point lies closer than ``collision_threshold`` to any
    scene point.

    Args:
        scene_pc: Nx3 scene point cloud
        grasp_poses: Kx4x4 array of grasp poses
        gripper_collision_mesh: Trimesh of gripper collision geometry
        collision_threshold: Distance threshold for collision detection (meters)
        num_collision_samples: Number of points to sample from gripper mesh surface

    Returns:
        collision_mask: K-length boolean array, True if grasp is collision-free.
        An empty scene yields all True; an empty grasp list yields an empty
        boolean array.
    """
    gripper_surface_points, _ = trimesh.sample.sample_surface(
        gripper_collision_mesh, num_collision_samples
    )
    gripper_surface_points = np.array(gripper_surface_points)

    scene_pc = np.asarray(scene_pc, dtype=np.float32)
    num_grasps = len(grasp_poses)

    logger.info(
        "Checking collision for %s grasps against %s scene points...",
        num_grasps,
        len(scene_pc),
    )

    # Nothing to collide with: a min-reduction over zero scene points would
    # raise, so answer directly.
    if len(scene_pc) == 0:
        logger.info("Found %s/%s collision-free grasps", num_grasps, num_grasps)
        return np.ones(num_grasps, dtype=bool)

    # Compare squared distances to avoid a square root per point pair.
    threshold_sq = collision_threshold * collision_threshold
    # Gripper points are processed in batches to bound the size of the
    # (batch, N, 3) difference tensor.
    batch_size = 100
    collision_free_mask = np.ones(num_grasps, dtype=bool)

    for grasp_idx, grasp_pose in enumerate(
        tqdm(grasp_poses, total=num_grasps, desc="Collision checking")
    ):
        gripper_points_transformed = tra.transform_points(
            gripper_surface_points, grasp_pose
        ).astype(np.float32, copy=False)

        for j in range(0, len(gripper_points_transformed), batch_size):
            batch_gripper_points = gripper_points_transformed[j : j + batch_size]
            diff = batch_gripper_points[:, None, :] - scene_pc[None, :, :]
            dist_sq = np.einsum("ijk,ijk->ij", diff, diff)
            batch_min_dist_sq = np.min(dist_sq, axis=1)
            if np.any(batch_min_dist_sq < threshold_sq):
                # One colliding point settles this grasp; skip the remaining batches.
                collision_free_mask[grasp_idx] = False
                break

    num_collision_free = np.sum(collision_free_mask)
    logger.info("Found %s/%s collision-free grasps", num_collision_free, len(grasp_poses))

    return collision_free_mask
368
+
369
+
370
+ __all__ = [
371
+ "knn_points",
372
+ "point_cloud_outlier_removal",
373
+ "point_cloud_outlier_removal_with_color",
374
+ "depth2points",
375
+ "depth_and_segmentation_to_point_clouds",
376
+ "filter_colliding_grasps",
377
+ ]
@@ -16,6 +16,7 @@ class Segmentation:
16
16
  self.client = PickleClient(hostname, port)
17
17
  self.tracking_object_ids = []
18
18
  self.current_frame_masks = {}
19
+ self.invisible_object_ids = []
19
20
  self.image_prompt_names = set()
20
21
  if compression_strategy in STRATEGIES:
21
22
  self.compression_strategy_name = compression_strategy
@@ -51,14 +52,28 @@ class Segmentation:
51
52
  else:
52
53
  raise ValueError(f"Only valid compression strategies are {list(STRATEGIES.keys())}")
53
54
 
55
def set_config(self, config):
    """Send a ``set_config`` request carrying ``config`` and return the client's reply."""
    return self.client.send_data({"operation": "set_config", "config": config})
58
+
59
def get_capabilities(self):
    """Send a ``get_capabilities`` request and return the client's reply."""
    return self.client.send_data({"operation": "get_capabilities"})
62
+
63
def get_config(self):
    """Send a ``get_config`` request and return the client's reply."""
    return self.client.send_data({"operation": "get_config"})
66
+
54
67
def reset(self):
    """Clear local tracking state and send a ``reset`` operation to the server.

    When benchmarking is enabled, the per-call timing and call counters are
    zeroed as well.
    """
    # NOTE(review): indentation is not visible in the diff this was taken
    # from; only the two counter dicts are assumed to be guarded by
    # ``self.benchmark`` - confirm against the packaged file.
    self.first_frame_registered = False
    self.tracking_object_ids = []
    self.current_frame_masks = {}
    self.invisible_object_ids = []
    self.encoder = None
    if self.benchmark:
        timed_calls = ("add_image_prompt", "register_first_frame", "get_next")
        self.call_time = dict.fromkeys(timed_calls, 0)
        self.call_count = dict.fromkeys(timed_calls, 0)
    self.client.send_data({"operation": "reset"})
62
77
 
63
78
  def add_image_prompt(self, object_name, object_image):
64
79
  if self.benchmark:
@@ -100,6 +115,9 @@ class Segmentation:
100
115
  if np.any(mask):
101
116
  masks[obj_id] = mask
102
117
  self.current_frame_masks = masks
118
+ self.invisible_object_ids = [
119
+ obj_id for obj_id in self.tracking_object_ids if obj_id not in masks
120
+ ]
103
121
  if self.benchmark:
104
122
  self.call_time["register_first_frame"] += time.time() - start
105
123
  self.call_count["register_first_frame"] += 1
@@ -124,21 +142,57 @@ class Segmentation:
124
142
  if np.any(mask):
125
143
  masks[obj_id] = mask
126
144
  self.current_frame_masks = masks
145
+ self.invisible_object_ids = [
146
+ obj_id for obj_id in self.tracking_object_ids if obj_id not in masks
147
+ ]
127
148
  if self.benchmark:
128
149
  self.call_time["get_next"] += time.time() - start
129
150
  self.call_count["get_next"] += 1
130
151
  return masks
152
+ if isinstance(response, dict) and any(
153
+ key in response for key in ("result", "status", "success", "message")
154
+ ):
155
+ if self.benchmark:
156
+ self.call_time["get_next"] += time.time() - start
157
+ self.call_count["get_next"] += 1
158
+ return response
131
159
  if self.benchmark:
132
160
  self.call_time["get_next"] += time.time() - start
133
161
  self.call_count["get_next"] += 1
134
162
  return None
135
163
 
164
def remove_object(self, obj_id, strict=False, need_output=False):
    """Ask the server to stop tracking ``obj_id`` and sync local state.

    Args:
        obj_id: identifier of the tracked object to remove.
        strict: forwarded unchanged in the request.
        need_output: forwarded unchanged in the request.

    Returns:
        The server response, or None (after printing a message) when no
        first frame has been registered yet.

    On a successful response that carries ``data["obj_ids"]``, that list
    becomes ``tracking_object_ids``, masks of objects no longer listed are
    dropped, and ``invisible_object_ids`` is recomputed. Local state is left
    untouched when the response has no usable ``data`` payload.
    """
    if not self.first_frame_registered:
        print("Segmentation: register_first_frame must be called first")
        return None

    request = {
        "operation": "remove_object",
        "obj_id": obj_id,
        "strict": strict,
        "need_output": need_output,
    }
    response = self.client.send_data(request)
    if not self._is_success(response):
        return response

    # ``data`` may be missing or explicitly None; treat both as "no payload"
    # instead of failing on ``None.get``.
    payload = response.get("data") or {}
    remaining_ids = payload.get("obj_ids") if isinstance(payload, dict) else None
    if remaining_ids is not None:
        self.tracking_object_ids = remaining_ids
        self.current_frame_masks = {
            tracked_id: mask
            for tracked_id, mask in self.current_frame_masks.items()
            if tracked_id in remaining_ids
        }
        self.invisible_object_ids = [
            tracked_id
            for tracked_id in remaining_ids
            if tracked_id not in self.current_frame_masks
        ]
    return response
188
+
136
189
def finish(self):
    """Drop all local tracking state without contacting the server.

    A warning is printed when no first frame had been registered; the state
    is cleared either way.
    """
    already_registered = self.first_frame_registered
    if not already_registered:
        print("Warning: Segmentation: register_first_frame must be called first")
    self.first_frame_registered = False
    self.tracking_object_ids = []
    self.current_frame_masks = {}
    self.invisible_object_ids = []
142
196
 
143
197
  def close(self):
144
198
  """Close underlying ZeroMQ socket/context."""
@@ -0,0 +1,301 @@
1
+ Metadata-Version: 2.4
2
+ Name: neuromeka_vfm
3
+ Version: 0.1.6
4
+ Summary: Client utilities for Neuromeka VFM FoundationPose RPC (upload meshes, call server)
5
+ Author: Neuromeka
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Neuromeka Co., Ltd.
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Classifier: Development Status :: 3 - Alpha
29
+ Classifier: Intended Audience :: Developers
30
+ Classifier: License :: OSI Approved :: MIT License
31
+ Classifier: Programming Language :: Python :: 3
32
+ Classifier: Programming Language :: Python :: 3.8
33
+ Classifier: Programming Language :: Python :: 3.9
34
+ Classifier: Programming Language :: Python :: 3.10
35
+ Classifier: Programming Language :: Python :: 3.11
36
+ Classifier: Programming Language :: Python :: 3.12
37
+ Requires-Python: >=3.8
38
+ Description-Content-Type: text/markdown
39
+ License-File: LICENSE
40
+ Requires-Dist: numpy
41
+ Requires-Dist: pyzmq
42
+ Requires-Dist: paramiko
43
+ Requires-Dist: av
44
+ Requires-Dist: trimesh
45
+ Requires-Dist: tqdm
46
+ Dynamic: license-file
47
+
48
+ # neuromeka_vfm
49
+
50
+ A lightweight client SDK for communicating with Segmentation (SAM2, Grounding DINO) and Pose Estimation (NVIDIA FoundationPose) servers over RPC/ZeroMQ. It also provides SSH/SFTP utilities to upload mesh files to the host.
51
+
52
+ - Website: http://www.neuromeka.com
53
+ - PyPI package: https://pypi.org/project/neuromeka_vfm/
54
+ - Documents: https://docs.neuromeka.com
55
+
56
+ ## Installation
57
+
58
+ ```bash
59
+ pip install neuromeka_vfm
60
+ ```
61
+
62
+ ## Python API (usage by example)
63
+
64
+ - Client PC: the machine running your application with this package installed.
65
+ - Host PC: the machine running Segmentation and Pose Estimation Docker servers. If you run Docker locally, use `localhost`.
66
+
67
+ ### Segmentation
68
+
69
+ ```python
70
+ from neuromeka_vfm import Segmentation
71
+
72
+ seg = Segmentation(
73
+ hostname="192.168.10.63",
74
+ port=5432,
75
+ compression_strategy="png", # none | png | jpeg | h264
76
+ )
77
+
78
+ # Register using an image prompt
79
+ seg.add_image_prompt("drug_box", ref_rgb)
80
+ seg.register_first_frame(
81
+ frame=first_rgb,
82
+ prompt="drug_box", # ID string
83
+ use_image_prompt=True,
84
+ )
85
+
86
+ # Register using a text prompt
87
+ seg.register_first_frame(
88
+ frame=first_rgb,
89
+ prompt="box .", # Text prompt (must end with " .")
90
+ use_image_prompt=False,
91
+ )
92
+
93
+ # SAM2 tracking on the registered mask(s)
94
+ resp = seg.get_next(next_rgb)
95
+ if isinstance(resp, dict) and resp.get("result") == "ERROR":
96
+ print(f"Tracking error: {resp.get('message')}")
97
+ seg.reset()
98
+ else:
99
+ masks = resp
100
+
101
+ # Segmentation settings / model selection (nrmk_realtime_segmentation v0.2+)
102
+ caps = seg.get_capabilities()["data"]
103
+ current = seg.get_config()["data"]
104
+ seg.set_config(
105
+ {
106
+ "grounding_dino": {
107
+ "backbone": "Swin-B", # Swin-T | Swin-B
108
+ "box_threshold": 0.35,
109
+ "text_threshold": 0.25,
110
+ },
111
+ "dino_detection": {
112
+ "threshold": 0.5,
113
+ "target_multiplier": 25,
114
+ "img_multiplier": 50,
115
+ "background_threshold": -1.0,
116
+ "final_erosion_count": 10,
117
+ "segment_min_size": 20,
118
+ },
119
+ "sam2": {
120
+ "model": "facebook/sam2.1-hiera-large",
121
+ "use_legacy": False,
122
+ "compile": False,
123
+ "offload_state_to_cpu": False,
124
+ "offload_video_to_cpu": False,
125
+ },
126
+ }
127
+ )
128
+
129
+ # Remove an object (v0.2+, only when use_legacy=False)
130
+ seg.remove_object("cup_0")
131
+
132
+ seg.close()
133
+ ```
134
+
135
+ Additional Segmentation APIs and behaviors
136
+
137
+ - `benchmark=True` in the constructor enables timing counters (`call_time`, `call_count`) for `add_image_prompt`, `register_first_frame`, and `get_next`.
138
+ - `switch_compression_strategy()` lets you change the compression strategy at runtime.
139
+ - `register_first_frame()` returns `True`/`False` and raises `ValueError` if image prompts are missing when `use_image_prompt=True`.
140
+ - `register_first_frame()` accepts a list of prompt IDs when `use_image_prompt=True`.
141
+ - `get_next()` returns `None` if called before registration; it can also return the server error dict when available.
142
+ - `reset()` performs a server-side reset, while `finish()` clears only local state.
143
+ - Exposed state: `tracking_object_ids`, `current_frame_masks`, `invisible_object_ids`.
144
+ - Backward-compat alias: `NrmkRealtimeSegmentation`.
145
+
146
+ #### Segmentation v0.2 config summary (defaults/choices)
147
+ `seg.get_capabilities()` can differ depending on server configuration. The following reflects v0.2 defaults.
148
+
149
+ ```yaml
150
+ grounding_dino:
151
+ backbone:
152
+ choices:
153
+ - Swin-B
154
+ - Swin-T
155
+ default: Swin-T
156
+ box_threshold:
157
+ default: 0.35
158
+ min: 0.0
159
+ max: 1.0
160
+ text_threshold:
161
+ default: 0.25
162
+ min: 0.0
163
+ max: 1.0
164
+
165
+ dino_detection:
166
+ threshold:
167
+ default: 0.5
168
+ target_multiplier:
169
+ default: 25
170
+ img_multiplier:
171
+ default: 50
172
+ background_threshold:
173
+ default: -1.0
174
+ final_erosion_count:
175
+ default: 10
176
+ segment_min_size:
177
+ default: 20
178
+
179
+ sam2:
180
+ model:
181
+ choices:
182
+ - facebook/sam2-hiera-base-plus
183
+ - facebook/sam2-hiera-large
184
+ - facebook/sam2-hiera-small
185
+ - facebook/sam2-hiera-tiny
186
+ - facebook/sam2.1-hiera-base-plus
187
+ - facebook/sam2.1-hiera-large
188
+ - facebook/sam2.1-hiera-small
189
+ - facebook/sam2.1-hiera-tiny
190
+ default: facebook/sam2.1-hiera-large
191
+ use_legacy:
192
+ default: false
193
+ compile:
194
+ default: false
195
+ offload_state_to_cpu:
196
+ default: false
197
+ offload_video_to_cpu:
198
+ default: false
199
+ ```
200
+
201
+ #### Segmentation v0.2 notes and changes
202
+
203
+ - If SAM2 VRAM estimation fails, `seg.get_next()` may return `{"result":"ERROR"}`. Handle the error and call `reset` before re-registering.
204
+ - `compile=True` can slow down first-frame registration and `reset`.
205
+ - CPU offloading is most effective when both `offload_state_to_cpu=True` and `offload_video_to_cpu=True` are set (legacy mode does not support `offload_video_to_cpu`).
206
+ - `remove_object` is supported only when `use_legacy=False`.
207
+ - GroundingDINO added the Swin-B backbone and fixed prompt-token merge issues.
208
+
209
+ ### Pose Estimation
210
+
211
+ **Mesh upload**: Upload the mesh file (STL) to `/opt/meshes/` on the host PC with `upload_mesh`, or copy it there manually over SSH/SFTP.
212
+
213
+ ```python
214
+ from neuromeka_vfm import upload_mesh
215
+
216
+ upload_mesh(
217
+ host="192.168.10.63",
218
+ user="user",
219
+ password="pass",
220
+ local="mesh/my_mesh.stl", # local mesh path
221
+ remote="/opt/meshes/my_mesh.stl", # host mesh path (Docker volume)
222
+ )
223
+ ```
224
+
225
+ Initialization
226
+
227
+ ```python
228
+ from neuromeka_vfm import PoseEstimation
229
+
230
+ pose = PoseEstimation(host="192.168.10.72", port=5557)
231
+
232
+ pose.init(
233
+ mesh_path="/app/modules/foundation_pose/mesh/my_mesh.stl",
234
+ apply_scale=1.0,
235
+ track_refine_iter=3,
236
+ min_n_views=40,
237
+ inplane_step=60,
238
+ )
239
+ ```
240
+
241
+ - mesh_path: path to the mesh file (STL/OBJ). Initialization fails if missing.
242
+ - apply_scale: scalar applied after loading the mesh.
243
+ - STL in meters: 1.0 (no scaling)
244
+ - STL in centimeters: 0.01 (1 cm -> 0.01 m)
245
+ - STL in millimeters: 0.001 (1 mm -> 0.001 m)
246
+ - force_apply_color: if True, forces a solid color when the mesh lacks color data.
247
+ - apply_color: RGB tuple (0-255) used when `force_apply_color=True`.
248
+ - est_refine_iter: number of refinement iterations during registration (higher = more accurate, slower).
249
+ - track_refine_iter: number of refinement iterations per frame during tracking.
250
+ - min_n_views: minimum number of sampled camera views (affects rotation candidates).
251
+ - inplane_step: in-plane rotation step in degrees (smaller = more candidates).
252
+
253
+ Registration and tracking
254
+
255
+ ```python
256
+ # Registration (the server default is used when `iteration` is omitted; check_vram=True pre-checks GPU VRAM)
257
+ register_resp = pose.register(rgb=rgb0, depth=depth0, mask=mask0, K=cam_K, check_vram=True)
258
+
259
+ # Tracking (optionally limit search area with bbox_xywh)
260
+ track_resp = pose.track(rgb=rgb1, depth=depth1, K=cam_K, bbox_xywh=bbox_xywh)
261
+
262
+ pose.close()
263
+ ```
264
+
265
+ - cam_K: camera intrinsics.
266
+ - Large RGB resolution, large `min_n_views`, or small `inplane_step` can cause GPU VRAM errors.
267
+ - `check_vram=True` in `register` performs a pre-check to prevent server shutdown due to OOM.
268
+ - `iteration` in `register`/`track` can override the server default if provided.
269
+ - `reset()` resets the server state; `reset_object()` reuses the cached mesh to rebuild the rotation grid.
270
+ - Default host/port can come from `FPOSE_HOST` and `FPOSE_PORT` environment variables.
271
+ - Backward-compat alias: `FoundationPoseClient`.
272
+
273
+ <!--
274
+ ## Benchmark
275
+
276
+ Measured on local servers. Empty cells are not yet measured.
277
+
278
+ **RTX 5060**
279
+ | Task | Prompt | None (s) | JPEG (s) | PNG (s) | h264 (s) |
280
+ | --- | --- | --- | --- | --- | --- |
281
+ | Grounding DINO | text (human . cup .) | 0.86 | 0.35 | 0.50 | 0.52 |
282
+ | DINOv2 | image prompt | 0.85 | 0.49 | 0.65 | 0.63 |
283
+ | SAM2 | - | | | | |
284
+ | FoundationPose registration | - | | | | |
285
+ | FoundationPose track | - | | | | |
286
+
287
+ **RTX 5090**
288
+ | Task | Prompt | None (s) | JPEG (s) | PNG (s) | h264 (s) |
289
+ | --- | --- | --- | --- | --- | --- |
290
+ | Grounding DINO | text (human . cup .) | | | | |
291
+ | DINOv2 | image prompt | | | | |
292
+ | SAM2 | - | | | | |
293
+ | FoundationPose registration | - | 0.4 | - | | |
294
+ | FoundationPose track | - | 0.03 | | | |
295
+ -->
296
+
297
+ ## Release notes
298
+
299
+ - 0.1.2: Improved success detection for Segmentation responses (`result`/`success`/`status`), fixed image prompt registration/usage, added `check_vram` to PoseEstimation `register`.
300
+ - 0.1.1: Improved resource cleanup in PoseEstimation/Segmentation, use server defaults when iteration is omitted, added pose demo example.
301
+ - 0.1.0: Initial public release. Includes FoundationPose RPC client, real-time segmentation client, SSH-based mesh upload CLI/API.
@@ -1,14 +1,16 @@
1
1
  neuromeka_vfm/__init__.py,sha256=h5ODdWFgN7a9TBzF6Qfdyx5VxUr2hG0pFTwq57jEvDo,422
2
2
  neuromeka_vfm/compression.py,sha256=d2xOz4XBJZ60pPSXwQ5LPYwhpsaNORvNoY_0CUiAvt0,5191
3
+ neuromeka_vfm/generate_mesh.py,sha256=HV2dUfVXROPQ9kDDPmkwdn5E5gelWIQDsPsZuvZxI6E,3634
3
4
  neuromeka_vfm/pickle_client.py,sha256=Iw2fpxdnKB20oEUgsd0rJlvzOd5JhetphpKkF9qQcX0,591
5
+ neuromeka_vfm/point_cloud_utils.py,sha256=ZnCh8Xg6pLGoyi5ufZkz59HzE9RuRdihE8z-XNYT1PA,13261
4
6
  neuromeka_vfm/pose_estimation.py,sha256=3MUVhL0nMcpHApZDAzutS7fINPHcb-tu_WoXvNGU33E,2625
5
- neuromeka_vfm/segmentation.py,sha256=wae0_m225DUMD8Nm2A7iQm49QWeUas17B8PaoGGFt5w,6311
7
+ neuromeka_vfm/segmentation.py,sha256=8kmMut_gNJ3wa9F0l7iEYFNqHJzHJ5KPBzs7vSiwjqg,8464
6
8
  neuromeka_vfm/upload_mesh.py,sha256=aW5G9aE5OeiDN5pEVKDzMeV538U-I2iRYZvVZTfGsr4,2728
7
9
  neuromeka_vfm/examples/__init__.py,sha256=dEhb0FqhpEGNmg0pMunmrTlViIcxvd95fYEjZ49IOTQ,37
8
10
  neuromeka_vfm/examples/pose_demo.py,sha256=zq1Z0_kxQc4CB-ltfwm_oMoC7JLoN5GyeE3C6jKGQKw,13658
9
- neuromeka_vfm-0.1.4.dist-info/licenses/LICENSE,sha256=40cBWxFahhu0p_EB0GhU8oVIifVNmH1o2fZtx0bIif8,1076
10
- neuromeka_vfm-0.1.4.dist-info/METADATA,sha256=EPZ5Y6uuSz00whHesar6bKAqrUDNpGAfquZq9Pvd24U,7829
11
- neuromeka_vfm-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
12
- neuromeka_vfm-0.1.4.dist-info/entry_points.txt,sha256=Wl4XqiUt_GLQ08oTJtsYjLW0iYxZ52ysVd1-cN0kYP4,72
13
- neuromeka_vfm-0.1.4.dist-info/top_level.txt,sha256=uAH_yXikUvxXTSEnUC0M8Zl5ggxbnkYtXlmTfEG8MUk,14
14
- neuromeka_vfm-0.1.4.dist-info/RECORD,,
11
+ neuromeka_vfm-0.1.6.dist-info/licenses/LICENSE,sha256=40cBWxFahhu0p_EB0GhU8oVIifVNmH1o2fZtx0bIif8,1076
12
+ neuromeka_vfm-0.1.6.dist-info/METADATA,sha256=zHBuQ5sHDJjOdY96dmxLiwtE8OHIRJg0VcyzUB0rV48,10628
13
+ neuromeka_vfm-0.1.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
14
+ neuromeka_vfm-0.1.6.dist-info/entry_points.txt,sha256=Wl4XqiUt_GLQ08oTJtsYjLW0iYxZ52ysVd1-cN0kYP4,72
15
+ neuromeka_vfm-0.1.6.dist-info/top_level.txt,sha256=uAH_yXikUvxXTSEnUC0M8Zl5ggxbnkYtXlmTfEG8MUk,14
16
+ neuromeka_vfm-0.1.6.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,186 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: neuromeka_vfm
3
- Version: 0.1.4
4
- Summary: Client utilities for Neuromeka VFM FoundationPose RPC (upload meshes, call server)
5
- Author: Neuromeka
6
- License: MIT License
7
-
8
- Copyright (c) 2025 Neuromeka Co., Ltd.
9
-
10
- Permission is hereby granted, free of charge, to any person obtaining a copy
11
- of this software and associated documentation files (the "Software"), to deal
12
- in the Software without restriction, including without limitation the rights
13
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
- copies of the Software, and to permit persons to whom the Software is
15
- furnished to do so, subject to the following conditions:
16
-
17
- The above copyright notice and this permission notice shall be included in all
18
- copies or substantial portions of the Software.
19
-
20
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
- SOFTWARE.
27
-
28
- Classifier: Development Status :: 3 - Alpha
29
- Classifier: Intended Audience :: Developers
30
- Classifier: License :: OSI Approved :: MIT License
31
- Classifier: Programming Language :: Python :: 3
32
- Classifier: Programming Language :: Python :: 3.8
33
- Classifier: Programming Language :: Python :: 3.9
34
- Classifier: Programming Language :: Python :: 3.10
35
- Classifier: Programming Language :: Python :: 3.11
36
- Classifier: Programming Language :: Python :: 3.12
37
- Requires-Python: >=3.8
38
- Description-Content-Type: text/markdown
39
- License-File: LICENSE
40
- Requires-Dist: numpy
41
- Requires-Dist: pyzmq
42
- Requires-Dist: paramiko
43
- Requires-Dist: av
44
- Dynamic: license-file
45
-
46
- # neuromeka_vfm
47
-
48
- 클라이언트 PC에서 Segmentation (SAM2, Grounding DINO), Pose Estimation (NVIDIA FoundationPose) 서버(RPC, ZeroMQ)와 통신하고, SSH/SFTP로 호스트에 mesh를 업로드하는 간단한 유틸 패키지입니다.
49
-
50
- - Website: http://www.neuromeka.com
51
- - Source code: https://github.com/neuromeka-robotics/neuromeka_vfm
52
- - PyPI package: https://pypi.org/project/neuromeka_vfm/
53
- - Documents: https://docs.neuromeka.com
54
-
55
- ## Web UI (VFM Tester)를 통해 사용 가능
56
-
57
- - VFM Tester (Web UI): https://gitlab.com/neuromeka-group/nrmkq/nrmk_vfm_tester
58
-
59
-
60
- ## Installation
61
- ```bash
62
- pip install neuromeka_vfm
63
- ```
64
-
65
- ## Python API (예제로 보는 사용법)
66
-
67
- * 내 PC: 어플리케이션을 구현하고 이 패키지 (neuromeka_vfm)이 설치된 PC
68
- * 서버PC (Host): Segmentation, Pose Estimation 도커 서버가 설치된 PC. 내 PC에 도커를 설치할 경우 localhost 사용.
69
-
70
- ### Segmentation
71
- ```python
72
- from neuromeka_vfm import Segmentation
73
-
74
- seg = Segmentation(
75
- hostname="192.168.10.63",
76
- port=5432,
77
- compression_strategy="png", # none | png | jpeg | h264
78
- )
79
-
80
- # Image Prompt를 이용한 등록
81
- seg.add_image_prompt("drug_box", ref_rgb)
82
- seg.register_first_frame(frame=first_rgb,
83
- prompt="drug_box", # ID str
84
- use_image_prompt=True)
85
-
86
- # Text Prompt를 이용한 등록
87
- seg.register_first_frame(frame=first_rgb,
88
- prompt="box .", # Text prompt (끝에 띄어쓰기 . 필수)
89
- use_image_prompt=False)
90
-
91
- # 등록된 mask에 대한 SAM2 tracking
92
- masks = seg.get_next(next_rgb)
93
-
94
-
95
- seg.close()
96
- ```
97
-
98
- ### Pose Estimation
99
-
100
- **Mesh 파일 업로드**: 등록/인식하고자 하는 mesh 파일 (stl)을 호스트PC의 '/opt/meshes/' 경로에 업로드 (직접 SSH 통해 파일을 옮겨도 됨)
101
- ```python
102
- from neuromeka_vfm import upload_mesh
103
- upload_mesh(
104
- host="192.168.10.63",
105
- user="user",
106
- password="pass",
107
- local="mesh/my_mesh.stl", # 내 PC mesh 경로
108
- remote="/opt/meshes/my_mesh.stl", # 호스트PC mesh 경로 (도커 볼륨마운트)
109
- )
110
- ```
111
-
112
- 초기화
113
- ```python
114
- from neuromeka_vfm import PoseEstimation
115
-
116
- pose = PoseEstimation(host="192.168.10.72", port=5557)
117
-
118
- pose.init(
119
- mesh_path="/app/modules/foundation_pose/mesh/my_mesh.stl",
120
- apply_scale=1.0,
121
- track_refine_iter=3,
122
- min_n_views=40,
123
- inplane_step=60
124
- )
125
- ```
126
- - mesh_path: 사용할 물체 메시 파일(STL/OBJ 등) 경로. 없으면 초기화 실패.
127
- - apply_scale: 메시를 로드한 뒤 전체를 배율 조정하는 스케일 값. 단위 없는 곱셈 계수.
128
- - STL 모델이 미터 단위라면 1.0 (스케일 없음)
129
- - STL 모델이 센티미터 단위라면 0.01 (1 cm → 0.01 m)
130
- - STL 모델이 밀리미터 단위라면 0.001 (1 mm → 0.001 m)
131
- - force_apply_color: True일 때 메시에 단색 텍스처를 강제로 입힘. 메시가 색상을 안 가졌을 때 시각화 안정성을 위해 사용.
132
- - apply_color: force_apply_color가 True일 때 적용할 RGB 색상값(0~255) 튜플.
133
- - est_refine_iter: 초기 등록(register) 단계에서 포즈를 반복 정련하는 횟수. 값이 클수록 정확도 ↑, 연산 시간 ↑.
134
- - track_refine_iter: 추적(track) 단계에서 한 프레임당 포즈 정련 반복 횟수.
135
- - min_n_views: 초기 뷰 샘플링 시 생성할 최소 카메라 뷰 수(회전 후보 수에 영향).
136
- - inplane_step: in-plane 회전 샘플링 간격(도 단위). 값이 작을수록 더 많은 회전 후보를 생성.
137
-
138
-
139
- 인식 및 추적
140
- ```python
141
- # 초기 등록 (iteration 생략 시 서버 기본값, check_vram=True로 VRAM 사전 체크)
142
- register_resp = pose.register(rgb=rgb0, depth=depth0, mask=mask0, K=cam_K, check_vram=True)
143
-
144
- # 추적 (bbox_xywh로 탐색 범위 제한 가능)
145
- track_resp = pose.track(rgb=rgb1, depth=depth1, K=cam_K, bbox_xywh=bbox_xywh)
146
- pose.close()
147
- ```
148
- - cam_K: camera intrinsic
149
- - RGB resolution이 크거나, min_n_views 값이 크거나, inplane_step이 작을 경우 GPU VRAM 초과 에러 발생.
150
- - register check_vram=True 일 경우 VRAM 초과 사전 검사하여 shutdown 방지.
151
-
152
-
153
- ## VFM (Vision Foundation Model) latency benchmark
154
- 로컬 서버 구동 시 측정. 빈칸은 아직 미측정 항목입니다.
155
-
156
- **RTX 5060**
157
- | Task | Prompt | None (s) | JPEG (s) | PNG (s) | h264 (s) |
158
- | --- | --- | --- | --- | --- | --- |
159
- | Grounding DINO | text (human . cup .) | 0.86 | 0.35 | 0.50 | 0.52 |
160
- | DINOv2 | image prompt | 0.85 | 0.49 | 0.65 | 0.63 |
161
- | SAM2 | - | | | | |
162
- | FoundationPose registration | - | | | | |
163
- | FoundationPose track | - | | | | |
164
-
165
- **RTX 5090**
166
- | Task | Prompt | None (s) | JPEG (s) | PNG (s) | h264 (s) |
167
- | --- | --- | --- | --- | --- | --- |
168
- | Grounding DINO | text (human . cup .) | | | | |
169
- | DINOv2 | image prompt | | | | |
170
- | SAM2 | - | | | | |
171
- | FoundationPose registration | - | 0.4 | - | | |
172
- | FoundationPose track | - | 0.03 | | | |
173
-
174
- **Jetson Orin**
175
- | Task | Prompt | None (s) | JPEG (s) | PNG (s) | h264 (s) |
176
- | --- | --- | --- | --- | --- | --- |
177
- | Grounding DINO | text (human . cup .) | | | | |
178
- | DINOv2 | image prompt | | | | |
179
- | SAM2 | - | | | | |
180
- | FoundationPose registration | - | 0.4 | - | | |
181
- | FoundationPose track | - | 0.03 | | | |
182
-
183
- ## 릴리스 노트
184
- - 0.1.2: Segmentation 응답 성공 판정 개선(`result`/`success`/`status` 모두 지원), image prompt 등록/사용 오류 수정, PoseEstimation `register`에 `check_vram` 옵션 반영.
185
- - 0.1.1: PoseEstimation/Segmentation에서 리소스 정리 개선, iteration 미전달 시 서버 기본값 사용, pose 데모 예제 추가.
186
- - 0.1.0: 초기 공개 버전. FoundationPose RPC 클라이언트, 실시간 세그멘테이션 클라이언트, SSH 기반 mesh 업로드 CLI/API 포함.