distflow 1.0.0__tar.gz

@@ -0,0 +1,102 @@
+ Metadata-Version: 2.3
+ Name: distflow
+ Version: 1.0.0
+ Summary: Distance Computation Package for Data Preparation Bench
+ Requires-Dist: addict>=2.4.0
+ Requires-Dist: aiohttp>=3.11.0
+ Requires-Dist: datasets>=2.14.4
+ Requires-Dist: modelscope>=1.34.0
+ Requires-Dist: pandas>=2.3.3
+ Requires-Dist: pydantic>=2.12.5
+ Requires-Dist: pyyaml>=6.0
+ Requires-Dist: redis>=7.3.0
+ Requires-Dist: scikit-learn>=1.8.0
+ Requires-Dist: sentence-transformers>=5.3.0
+ Requires-Dist: torch>=2.6.0
+ Requires-Dist: transformers>=4.53.0
+ Requires-Dist: pre-commit>=4.2.0 ; extra == 'dev'
+ Requires-Dist: pyright>=1.1.408 ; extra == 'dev'
+ Requires-Dist: pytest>=8.4.1 ; extra == 'dev'
+ Requires-Dist: vllm>=0.8.5.post1 ; extra == 'vllm'
+ Requires-Python: >=3.12, <3.13
+ Provides-Extra: dev
+ Provides-Extra: vllm
+ Description-Content-Type: text/markdown
+
+ # Data-Preparation-Bench
+
+ A benchmark for evaluating the data preparation capabilities of large language models (LLMs). The benchmark is organized into two modules:
+
+ ## Modules
+
+ ### 1. Data Synthesis & Augmentation
+
+ Given raw metadata, the model is tasked with synthesizing or augmenting datasets to improve downstream model training.
+
+ ### 2. Data Quality Assessment
+
+ Given raw metadata, the model is tasked with predicting the training data's impact on downstream task performance.
+
+ ## Quick Start
+
+ ### Usage
+
+ This project uses [uv](https://docs.astral.sh/uv/) for dependency management. To get started:
+
+ ```bash
+ git clone https://github.com/haolpku/Data-Preparation-Bench.git
+ cd Data-Preparation-Bench
+ uv sync
+ ```
+
+ To use your own datasets, modify the configuration dictionaries and formatters in [compute_mmd.py](./examples/compute_mmd.py):
+
+ ```python
+ DS1_CONFIG = {
+     "name": "oda-math",
+     "data_path": "OpenDataArena/ODA-Math-460k",
+     "data_size": 5000,
+     "split": "train",
+     "shuffle_seed": 42,
+ }
+ formatter1 = AlpacaFormatter(
+     user_key="question",
+     assistant_key="response",
+ )
+
+ DS2_CONFIG = {
+     "name": "infinity-instruct",
+     "data_path": "BAAI/Infinity-Instruct",
+     "data_size": 5000,
+     "split": "train",
+     "shuffle_seed": 42,
+ }
+ formatter2 = ShareGptFormatter(
+     conversations_key="conversations",
+ )
+ ```
+
+ Typically, you only need to update `data_path` to point at your dataset and define a formatter that converts raw items into the required message format. After making these changes, run the MMD computation with:
+
+ ```bash
+ uv run examples/compute_mmd.py
+ ```
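+
+ If your dataset follows neither the Alpaca nor the ShareGPT layout, you can supply your own formatter: any object with a `format(raw_item) -> DatasetProcessOutputItem` method satisfies the package's `FormatterProtocol`. A minimal sketch (the `"prompt"`/`"answer"` keys are hypothetical, for illustration only):
+
+ ```python
+ from typing import Any, cast
+
+ from distflow.data.types import DatasetProcessOutputItem, MessageData
+
+
+ class PromptAnswerFormatter:
+     """Illustrative formatter for items shaped like {"prompt": ..., "answer": ...}."""
+
+     def format(self, raw_item: dict[str, Any]) -> DatasetProcessOutputItem:
+         return DatasetProcessOutputItem(
+             messages=[
+                 cast(MessageData, {"role": "user", "content": raw_item["prompt"]}),
+                 cast(MessageData, {"role": "assistant", "content": raw_item["answer"]}),
+             ],
+             meta={"raw_item": raw_item},
+         )
+ ```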
+
+ ### Development
+
+ To set up the development environment locally:
+
+ ```bash
+ uv sync --extra dev
+ uv run pre-commit install
+ ```
+
+ Before committing, format and lint the code:
+
+ ```bash
+ uv run pre-commit run --all-files
+ ```
+
+ ## Experiment Settings
+
+ Please refer to [Experiment.md](./Experiment.md) for detailed experiment configurations.
@@ -0,0 +1,77 @@
+ # Data-Preparation-Bench
+
+ A benchmark for evaluating the data preparation capabilities of large language models (LLMs). The benchmark is organized into two modules:
+
+ ## Modules
+
+ ### 1. Data Synthesis & Augmentation
+
+ Given raw metadata, the model is tasked with synthesizing or augmenting datasets to improve downstream model training.
+
+ ### 2. Data Quality Assessment
+
+ Given raw metadata, the model is tasked with predicting the training data's impact on downstream task performance.
+
+ ## Quick Start
+
+ ### Usage
+
+ This project uses [uv](https://docs.astral.sh/uv/) for dependency management. To get started:
+
+ ```bash
+ git clone https://github.com/haolpku/Data-Preparation-Bench.git
+ cd Data-Preparation-Bench
+ uv sync
+ ```
+
+ To use your own datasets, modify the configuration dictionaries and formatters in [compute_mmd.py](./examples/compute_mmd.py):
+
+ ```python
+ DS1_CONFIG = {
+     "name": "oda-math",
+     "data_path": "OpenDataArena/ODA-Math-460k",
+     "data_size": 5000,
+     "split": "train",
+     "shuffle_seed": 42,
+ }
+ formatter1 = AlpacaFormatter(
+     user_key="question",
+     assistant_key="response",
+ )
+
+ DS2_CONFIG = {
+     "name": "infinity-instruct",
+     "data_path": "BAAI/Infinity-Instruct",
+     "data_size": 5000,
+     "split": "train",
+     "shuffle_seed": 42,
+ }
+ formatter2 = ShareGptFormatter(
+     conversations_key="conversations",
+ )
+ ```
+
+ Typically, you only need to update `data_path` to point at your dataset and define a formatter that converts raw items into the required message format. After making these changes, run the MMD computation with:
+
+ ```bash
+ uv run examples/compute_mmd.py
+ ```
+
+ ### Development
+
+ To set up the development environment locally:
+
+ ```bash
+ uv sync --extra dev
+ uv run pre-commit install
+ ```
+
+ Before committing, format and lint the code:
+
+ ```bash
+ uv run pre-commit run --all-files
+ ```
+
+ ## Experiment Settings
+
+ Please refer to [Experiment.md](./Experiment.md) for detailed experiment configurations.
@@ -0,0 +1,45 @@
+ [project]
+ name = "distflow"
+ version = "1.0.0"
+ description = "Distance Computation Package for Data Preparation Bench"
+ readme = "README.md"
+ requires-python = ">=3.12,<3.13"
+ dependencies = [
+     "addict>=2.4.0",
+     "aiohttp>=3.11.0",
+     "datasets>=2.14.4",
+     "modelscope>=1.34.0",
+     "pandas>=2.3.3",
+     "pydantic>=2.12.5",
+     "pyyaml>=6.0",
+     "redis>=7.3.0",
+     "scikit-learn>=1.8.0",
+     "sentence-transformers>=5.3.0",
+     "torch>=2.6.0",
+     "transformers>=4.53.0",
+ ]
+
+ [project.optional-dependencies]
+ vllm = ["vllm>=0.8.5.post1"]
+ dev = [
+     "pre-commit>=4.2.0",
+     "pyright>=1.1.408",
+     "pytest>=8.4.1",
+ ]
+
+ [tool.black]
+ line-length = 88
+ target-version = ['py312']
+ include = '\.pyi?$'
+
+ [tool.isort]
+ profile = "black"
+ line_length = 88
+ src_paths = ["src", "tests"]
+
+ [build-system]
+ requires = ["uv_build>=0.9.5,<0.12"]
+ build-backend = "uv_build"
+
+ [tool.uv]
+ index-url = "https://mirrors.aliyun.com/pypi/simple"
File without changes
File without changes
@@ -0,0 +1,7 @@
+ from typing import Any, Protocol
+
+
+ class CacheProtocol(Protocol):
+     async def load_cache(self, cache_key: str) -> dict[str, Any] | None: ...
+
+     async def save_cache(self, cache_key: str, cache_value: dict[str, Any]) -> bool: ...
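Because `CacheProtocol` is structural, any class with matching async methods conforms; no inheritance is needed. A minimal in-memory sketch (the class name is hypothetical, handy for tests):

```python
from typing import Any


class InMemoryCache:
    """Illustrative CacheProtocol conformer backed by a plain dict."""

    def __init__(self) -> None:
        self._store: dict[str, dict[str, Any]] = {}

    async def load_cache(self, cache_key: str) -> dict[str, Any] | None:
        return self._store.get(cache_key)

    async def save_cache(self, cache_key: str, cache_value: dict[str, Any]) -> bool:
        self._store[cache_key] = cache_value
        return True
```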
@@ -0,0 +1,122 @@
+ import asyncio
+ import json
+ from typing import Any
+
+ from redis.asyncio import Redis
+
+ from distflow.utils import logger
+
+
+ class RedisCache:
+     """Cache backend implemented on top of Redis.
+
+     Talks to the Redis server directly through the async Redis client,
+     providing a distributed cache. A semaphore caps the number of
+     concurrent requests.
+     """
+
+     def __init__(
+         self,
+         redis_url: str = "redis://127.0.0.1:6379",
+         max_concurrent_requests: int = 50,
+         redis_db: int = 0,
+     ) -> None:
+         """Initialize the Redis cache.
+
+         Args:
+             redis_url: Redis connection URL, e.g. "redis://127.0.0.1:6379"
+             max_concurrent_requests: Maximum number of concurrent requests
+             redis_db: Redis database number, defaults to 0
+         """
+         self._semaphore = asyncio.Semaphore(max_concurrent_requests)
+
+         # The Redis client is created lazily on first use.
+         self._redis: Redis | None = None
+         self._redis_url = redis_url
+         self._redis_db = redis_db
+
+     async def _get_redis(self) -> Redis:
+         """Get or lazily create the Redis client."""
+         if self._redis is None:
+             self._redis = Redis.from_url(
+                 self._redis_url,
+                 db=self._redis_db,
+                 decode_responses=True,
+             )
+             try:
+                 # Probe the connection; ping() is a coroutine on the
+                 # asyncio client and must be awaited.
+                 await self._redis.ping()
+                 logger.info(
+                     f"Connected to Redis: {self._redis_url}, DB: {self._redis_db}"
+                 )
+             except Exception as e:
+                 logger.error(
+                     f"Cannot connect to Redis: {self._redis_url}, DB: {self._redis_db}, error: {e}"
+                 )
+                 raise ConnectionError(
+                     f"Cannot connect to Redis: {self._redis_url}, DB: {self._redis_db}"
+                 ) from e
+         return self._redis
+
+     async def load_cache(self, cache_key: str) -> dict[str, Any] | None:
+         """Fetch a single cached value from Redis (concurrency bounded by the semaphore).
+
+         Args:
+             cache_key: Cache key
+
+         Returns:
+             The cached value as a dict, or None if the key does not exist
+         """
+         for attempt in range(3):
+             async with self._semaphore:
+                 try:
+                     redis = await self._get_redis()
+                     cached_data = await redis.get(cache_key)
+                     if cached_data:
+                         return json.loads(cached_data)
+                     return None
+                 except Exception as e:
+                     logger.warning(
+                         f"Redis cache lookup failed {attempt + 1} / 3: {type(e).__name__}: {e}"
+                     )
+                     await asyncio.sleep(0.1 * (attempt + 1))  # simple linear backoff
+                     self._redis = None  # reset the client to force a reconnect
+         return None
+
+     async def save_cache(self, cache_key: str, cache_value: dict[str, Any]) -> bool:
+         """Write a single cache value to Redis (concurrency bounded by the semaphore).
+
+         Args:
+             cache_key: Cache key
+             cache_value: Cache value
+
+         Returns:
+             Whether the write succeeded
+         """
+         for attempt in range(3):
+             async with self._semaphore:
+                 try:
+                     redis = await self._get_redis()
+                     serialized = json.dumps(cache_value)
+                     await redis.set(cache_key, serialized)
+                     return True
+                 except Exception as e:
+                     logger.warning(
+                         f"Redis cache write failed {attempt + 1} / 3: {type(e).__name__}: {e}"
+                     )
+                     await asyncio.sleep(0.1 * (attempt + 1))  # simple linear backoff
+                     self._redis = None  # reset the client to force a reconnect
+         return False
+
+     async def close(self) -> None:
+         """Close the Redis connection."""
+         if self._redis:
+             await self._redis.close()
+             logger.info("Redis connection closed")
+
+     async def __aenter__(self) -> "RedisCache":
+         """Async context manager entry."""
+         return self
+
+     async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+         """Async context manager exit."""
+         await self.close()
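A usage sketch for `RedisCache`; the module path `distflow.cache.redis_cache` is assumed (the diff does not show file names), and a reachable Redis server is required:

```python
import asyncio

from distflow.cache.redis_cache import RedisCache  # assumed module path


async def main() -> None:
    # The async context manager closes the connection on exit.
    async with RedisCache(redis_url="redis://127.0.0.1:6379") as cache:
        await cache.save_cache("example-key", {"embedding": [0.1, 0.2]})
        value = await cache.load_cache("example-key")
        print(value)


asyncio.run(main())
```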
File without changes
@@ -0,0 +1,73 @@
+ from __future__ import annotations
+
+ from typing import Any, Protocol, cast, runtime_checkable
+
+ from distflow.data.types import DatasetProcessOutputItem, MessageData
+
+
+ @runtime_checkable
+ class FormatterProtocol(Protocol):
+     def format(self, raw_item: dict[str, Any]) -> DatasetProcessOutputItem: ...
+
+
+ class AlpacaFormatter:
+     def __init__(self, *, user_key: str, assistant_key: str) -> None:
+         self.user_key = user_key
+         self.assistant_key = assistant_key
+
+     def format(self, raw_item: dict[str, Any]) -> DatasetProcessOutputItem:
+         assert (
+             self.user_key in raw_item
+         ), f"User key '{self.user_key}' not found in raw item"
+         assert (
+             self.assistant_key in raw_item
+         ), f"Assistant key '{self.assistant_key}' not found in raw item"
+         user_content = raw_item[self.user_key]
+         assert isinstance(
+             user_content, str
+         ), f"User content must be a string, got {type(user_content).__name__}: {user_content}"
+         assistant_content = raw_item[self.assistant_key]
+         assert isinstance(
+             assistant_content, str
+         ), f"Assistant content must be a string, got {type(assistant_content).__name__}: {assistant_content}"
+
+         return DatasetProcessOutputItem(
+             messages=[
+                 cast(MessageData, {"role": "user", "content": user_content}),
+                 cast(MessageData, {"role": "assistant", "content": assistant_content}),
+             ],
+             meta={
+                 "user_key": self.user_key,
+                 "assistant_key": self.assistant_key,
+                 "raw_item": raw_item,
+             },
+         )
+
+
+ class ShareGptFormatter:
+     def __init__(self, *, conversations_key: str) -> None:
+         self.conversations_key = conversations_key
+
+     def format(self, raw_item: dict[str, Any]) -> DatasetProcessOutputItem:
+         assert (
+             self.conversations_key in raw_item
+         ), f"Conversations key '{self.conversations_key}' not found in raw item"
+         conversations = raw_item[self.conversations_key]
+         assert isinstance(
+             conversations, list
+         ), f"Conversations must be a list, got {type(conversations).__name__}: {conversations}"
+
+         messages: list[MessageData] = []
+         for conv in conversations:
+             # Silently skip entries that are not dicts carrying both role and content.
+             if isinstance(conv, dict):
+                 role = conv.get("role")
+                 content = conv.get("content")
+                 if role is not None and content is not None:
+                     messages.append(
+                         cast(MessageData, {"role": role, "content": content})
+                     )
+
+         return DatasetProcessOutputItem(
+             messages=messages,
+             meta={"conversations_key": self.conversations_key, "raw_item": raw_item},
+         )
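For illustration, formatting one raw Alpaca-style item (the key names mirror those configured in the README example):

```python
from distflow.data.data_formatter import AlpacaFormatter

formatter = AlpacaFormatter(user_key="question", assistant_key="response")
item = formatter.format({"question": "What is 2 + 2?", "response": "4"})

# item.messages holds a user turn and an assistant turn;
# item.meta records the keys used plus the original raw item.
print([m.role for m in item.messages])  # ['user', 'assistant']
```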
@@ -0,0 +1,64 @@
+ import builtins
+ import random
+ from typing import Any, Literal, cast
+
+ from distflow.data.data_formatter import FormatterProtocol
+ from distflow.data.types import DatasetProcessOutputItem
+ from distflow.utils import logger
+
+
+ def load_dataset(
+     dataset_name: str,
+     data_path: str,
+     load_type: Literal["datasets", "modelscope", "pandas"],
+     formatter: FormatterProtocol,
+     data_size: int = -1,
+     split: str = "train",
+     sep: str = "\t",
+     dtype: str = "str",
+     shuffle_seed: int = 42,
+     use_json: bool = False,
+ ) -> tuple[str, list[DatasetProcessOutputItem]]:
+     logger.info(f"Loading dataset: {dataset_name}, path: {data_path}, type: {load_type}")
+
+     logger.debug(f"Data size limit: {data_size if data_size > 0 else 'all'}")
+
+     match load_type:
+         case "datasets":
+             # Alias the import so it does not shadow this function's name.
+             from datasets import load_dataset as hf_load_dataset
+
+             logger.debug(f"Loading with datasets, split={split}, use_json={use_json}")
+             if use_json:
+                 dataset = hf_load_dataset("json", data_files=data_path, split=split)
+             else:
+                 dataset = hf_load_dataset(path=data_path, split=split)
+         case "modelscope":
+             from modelscope.msdatasets import MsDataset
+
+             logger.debug(f"Loading with modelscope, split={split}")
+             dataset = MsDataset.load(data_path, split=split)
+         case "pandas":
+             from datasets import Dataset
+             from pandas import read_csv
+
+             logger.debug("Loading with pandas")
+             dtype_actual = getattr(builtins, dtype)
+             df = read_csv(data_path, sep=sep, dtype=dtype_actual)
+             dataset = Dataset.from_pandas(df)
+         case _:
+             # Guard against an unbound `dataset` if an unknown type slips past typing.
+             raise ValueError(f"Unsupported load_type: {load_type}")
+
+     logger.info(f"Dataset loaded, total samples: {len(dataset)}")
+
+     random.seed(shuffle_seed)
+     logger.debug(f"Using random seed: {shuffle_seed}")
+     random_indices = list(range(len(dataset)))
+     if data_size > 0 and data_size < len(dataset):
+         logger.info(f"Randomly sampling {data_size} items")
+         random_indices = random.sample(random_indices, data_size)
+     else:
+         logger.info("Using all data")
+         random.shuffle(random_indices)
+     sampled_data = cast(list[dict[str, Any]], [dataset[i] for i in random_indices])
+     logger.debug("Sampling done, formatting data")
+     formatted_data = [formatter.format(data_item) for data_item in sampled_data]
+     return dataset_name, formatted_data
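A usage sketch wiring the loader to a formatter. The dataset path and keys mirror the README example; the module path `distflow.data.loader` is an assumption, and downloading the dataset requires network access:

```python
from distflow.data.data_formatter import AlpacaFormatter
from distflow.data.loader import load_dataset  # assumed module path

name, items = load_dataset(
    dataset_name="oda-math",
    data_path="OpenDataArena/ODA-Math-460k",
    load_type="datasets",
    formatter=AlpacaFormatter(user_key="question", assistant_key="response"),
    data_size=100,
    split="train",
    shuffle_seed=42,
)
print(name, len(items))  # "oda-math" and up to 100 formatted items
```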
@@ -0,0 +1,13 @@
+ from typing import Any
+
+ from pydantic import BaseModel
+
+
+ class MessageData(BaseModel):  # type: ignore[misc]
+     role: str
+     content: str | dict[str, Any]
+
+
+ class DatasetProcessOutputItem(BaseModel):  # type: ignore[misc]
+     messages: list[MessageData]
+     meta: dict[str, Any]
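Since both types are Pydantic models, nested dicts are validated and coerced on construction; a quick sketch:

```python
from distflow.data.types import DatasetProcessOutputItem

item = DatasetProcessOutputItem(
    messages=[{"role": "user", "content": "hello"}],  # coerced to MessageData
    meta={"source": "example"},
)
print(item.messages[0].role)  # "user"
```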
File without changes
@@ -0,0 +1,19 @@
+ from abc import ABC, abstractmethod
+
+ from distflow.embed.types import EmbeddingInputItem, EmbeddingResult
+
+
+ class BaseEmbed(ABC):
+     def __init__(self, model_name: str) -> None:
+         self.model_name = model_name
+
+     @abstractmethod
+     def embed(self, dataset: list[EmbeddingInputItem]) -> list[EmbeddingResult]:
+         """Compute embeddings for a batch of items.
+
+         Args:
+             dataset: Data items to embed
+
+         Returns:
+             List of embedding results
+         """
@@ -0,0 +1,154 @@
+ import asyncio
+ import hashlib
+ import json
+ from collections.abc import Coroutine
+ from typing import Any
+
+ from distflow.cache.protocol import CacheProtocol
+ from distflow.embed.base import BaseEmbed
+ from distflow.embed.types import EmbeddingInputItem, EmbeddingResult
+ from distflow.utils import logger
+
+
+ def dict_to_hash(d: dict[Any, Any]) -> str:
+     """Produce a SHA256 digest of a dictionary."""
+     s = json.dumps(d, sort_keys=True).encode()
+     return hashlib.sha256(s).hexdigest()
+
+
+ class CachedEmbed(BaseEmbed):
+     """Embedding wrapper with a cache backend such as Redis.
+
+     Works with any CacheProtocol implementation (e.g. RedisCache),
+     providing a distributed cache of embedding results.
+     """
+
+     def __init__(
+         self,
+         embedder: BaseEmbed,
+         cache: CacheProtocol,
+         cache_model_id: str | None = None,
+         legacy_key: bool = False,
+     ) -> None:
+         """Initialize the cached embedder.
+
+         Args:
+             embedder: Underlying embedder used to compute uncached items
+             cache: A cache implementation conforming to CacheProtocol
+             cache_model_id: Model identifier used in cache keys; defaults to the
+                 model path. Lets an old cache keep working after a model is moved.
+             legacy_key: Whether to use the legacy cache key format (which
+                 includes the full data_item); defaults to False (new format:
+                 model_id + messages only)
+         """
+         self.embedder = embedder
+         self._cache = cache
+         self.model_path = (
+             getattr(embedder, "model_name", None)
+             or getattr(embedder, "model_path", None)
+             or "unknown"
+         )
+         # Model identifier used when building cache keys.
+         self.cache_model_id = cache_model_id if cache_model_id else self.model_path
+         self.legacy_key = legacy_key
+
+         super().__init__(self.model_path)
+
+     def _build_cache_key(self, item: EmbeddingInputItem) -> str:
+         """Build the cache key for an input item.
+
+         Args:
+             item: Input data item
+
+         Returns:
+             SHA256 hash key
+         """
+         if self.legacy_key:
+             # Legacy format: includes the full data_item (messages and meta).
+             key_payload = {
+                 "model_path": self.model_path,
+                 "data_item": item.model_dump(),
+             }
+         else:
+             # New format: only cache_model_id and messages (meta excluded).
+             key_payload = {
+                 "model_id": self.cache_model_id,
+                 "messages": [msg.model_dump() for msg in item.messages],
+             }
+         return dict_to_hash(key_payload)
+
+     def embed(self, dataset: list[EmbeddingInputItem]) -> list[EmbeddingResult]:
+         """Compute embeddings, serving hits from the cache.
+
+         Args:
+             dataset: Data items to embed
+
+         Returns:
+             List of embedding results
+         """
+         logger.info(f"Starting cached embedding computation, items: {len(dataset)}")
+
+         # Query all cache entries concurrently.
+         cache_keys = [self._build_cache_key(item) for item in dataset]
+         cache_tasks = [self._cache.load_cache(key) for key in cache_keys]
+
+         async def _run_all_get_cache() -> list[dict[str, Any] | None | BaseException]:
+             return await asyncio.gather(*cache_tasks, return_exceptions=True)
+
+         cached_values = asyncio.run(_run_all_get_cache())
+
+         # Split items into cache hits and misses.
+         results: list[EmbeddingResult | None] = [None] * len(dataset)
+         missing_items: list[EmbeddingInputItem] = []
+         missing_indices: list[int] = []
+         missing_keys: list[str] = []
+
+         for idx, (item, key, cached_result) in enumerate(
+             zip(dataset, cache_keys, cached_values)
+         ):
+             # Treat lookup exceptions as misses and recompute.
+             if isinstance(cached_result, BaseException):
+                 logger.debug(f"Cache lookup failed, recomputing: {cached_result}")
+                 missing_items.append(item)
+                 missing_indices.append(idx)
+                 missing_keys.append(key)
+             elif cached_result is None:
+                 missing_items.append(item)
+                 missing_indices.append(idx)
+                 missing_keys.append(key)
+             else:
+                 results[idx] = EmbeddingResult(
+                     embedding=cached_result["embedding"],
+                     data_item=item,
+                     meta=cached_result.get("meta", item.meta),
+                 )
+                 logger.debug(f"Cache hit: {key[:16]}...")
+
+         logger.info(f"Cache hits: {len(dataset) - len(missing_items)}/{len(dataset)}")
+
+         # Compute embeddings for the misses.
+         if missing_items:
+             new_results = self.embedder.embed(missing_items)
+
+             # Write the new results back to the cache concurrently.
+             write_tasks: list[Coroutine[Any, Any, bool]] = []
+             for key, idx, result in zip(missing_keys, missing_indices, new_results):
+                 cache_value = {
+                     "embedding": result.embedding,
+                     "meta": result.meta,
+                 }
+                 write_tasks.append(self._cache.save_cache(key, cache_value))
+                 results[idx] = EmbeddingResult(
+                     embedding=result.embedding,
+                     data_item=dataset[idx],
+                     meta=result.meta,
+                 )
+
+             # Wait for all writes to finish; tolerate individual failures so a
+             # failed write never discards a computed embedding.
+             async def _run_all_save_cache() -> list[bool | BaseException]:
+                 return await asyncio.gather(*write_tasks, return_exceptions=True)
+
+             write_results = asyncio.run(_run_all_save_cache())
+             success_count = sum(1 for r in write_results if r is True)
+             logger.info(f"Cache writes finished: {success_count}/{len(write_tasks)} succeeded")
+
+         logger.info(f"Embedding computation finished, {len(results)} results")
+         return [result for result in results if result is not None]
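Tying the layers together, a wiring sketch. The module paths and the `EmbeddingInputItem` constructor are assumptions inferred from the imports above, and `SentenceTransformerEmbed` refers to the illustrative subclass sketched after `BaseEmbed`:

```python
from distflow.cache.redis_cache import RedisCache  # assumed module path
from distflow.embed.cached import CachedEmbed  # assumed module path
from distflow.embed.types import EmbeddingInputItem

cache = RedisCache(redis_url="redis://127.0.0.1:6379")
embedder = CachedEmbed(
    embedder=SentenceTransformerEmbed(),  # illustrative BaseEmbed subclass
    cache=cache,
    cache_model_id="all-MiniLM-L6-v2",  # keys stay stable if the model path changes
)

# EmbeddingInputItem fields inferred from CachedEmbed's usage (messages, meta).
dataset = [EmbeddingInputItem(messages=[{"role": "user", "content": "hello"}], meta={})]

# embed() drives its cache I/O with asyncio.run, so call it outside any event loop.
results = embedder.embed(dataset)
print(results[0].embedding[:4])
```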