PyPI - UniTok - Versions diffs - 3.4.8__tar.gz → 3.4.9__tar.gz - Mend

UniTok 3.4.8.tar.gz → 3.4.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

{UniTok-3.4.8 → UniTok-3.4.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: UniTok
-Version: 3.4.8
+Version: 3.4.9
 Summary: Unified Tokenizer
 Home-page: https://github.com/Jyonn/UnifiedTokenizer
 Author: Jyonn Liu
@@ -10,18 +10,25 @@ Keywords: token,tokenizer
 Platform: any
 Description-Content-Type: text/markdown
-# UniTok V3
+# UniTok V3: 类SQL数据预处理工具包
+Updated on 2023.11.04
 ## 1. 简介
-UniTok 是一个强大的文本预处理工具包，它提供了一整套的数据预处理工具。UniTok 主要包括两大部分：`UniTok` 和 `UniDep`。`UniTok` 负责统一处理数据，它包括分词器（Tokenizers），数据列（Columns）等组件。`UniDep` 负责数据依赖的处理，包括词汇表（Vocabs），元数据（Meta）等。
+UniTok 是史上第一个类SQL的数据预处理工具包，提供了一整套的数据封装和编辑工具。
+UniTok 主要包括两大组件：负责统一数据处理的`UniTok` 和 负责数据读取和二次编辑的`UniDep`：
+- `UniTok` 通过分词器（Tokenizers）和数据列（Columns）等组件将生数据（Raw Data）进行分词与ID化操作，并最终以numpy数组格式存储为一张数据表。
+- `UniDep` 读取由`UniTok`生成的数据表以及元数据（如词表信息），可以直接与Pytorch的Dataset结合使用，也可以完成二次编辑、和其他数据表合并、导出等操作。
+- 在3.1.9版本后，我们推出`Fut` 组件，它是`UniTok`的替代品，可以更快速地完成数据预处理。
 ## 2. 安装
 使用pip安装：
 ```bash
-pip install unitok>=3.0.11
+pip install unitok>=3.4.8
 ```
 ## 3. 主要功能

{UniTok-3.4.8 → UniTok-3.4.9}/README.md RENAMED Viewed

@@ -1,15 +1,22 @@
-# UniTok V3
+# UniTok V3: 类SQL数据预处理工具包
+Updated on 2023.11.04
 ## 1. 简介
-UniTok 是一个强大的文本预处理工具包，它提供了一整套的数据预处理工具。UniTok 主要包括两大部分：`UniTok` 和 `UniDep`。`UniTok` 负责统一处理数据，它包括分词器（Tokenizers），数据列（Columns）等组件。`UniDep` 负责数据依赖的处理，包括词汇表（Vocabs），元数据（Meta）等。
+UniTok 是史上第一个类SQL的数据预处理工具包，提供了一整套的数据封装和编辑工具。
+UniTok 主要包括两大组件：负责统一数据处理的`UniTok` 和 负责数据读取和二次编辑的`UniDep`：
+- `UniTok` 通过分词器（Tokenizers）和数据列（Columns）等组件将生数据（Raw Data）进行分词与ID化操作，并最终以numpy数组格式存储为一张数据表。
+- `UniDep` 读取由`UniTok`生成的数据表以及元数据（如词表信息），可以直接与Pytorch的Dataset结合使用，也可以完成二次编辑、和其他数据表合并、导出等操作。
+- 在3.1.9版本后，我们推出`Fut` 组件，它是`UniTok`的替代品，可以更快速地完成数据预处理。
 ## 2. 安装
 使用pip安装：
 ```bash
-pip install unitok>=3.0.11
+pip install unitok>=3.4.8
 ```
 ## 3. 主要功能

{UniTok-3.4.8 → UniTok-3.4.9}/UniTok/meta.py RENAMED Viewed

@@ -4,9 +4,15 @@ import warnings
 from typing import List, Union
-class Col:
+class Ent:
+    def __init__(self, name, **kwargs):
+        self.name = name
+class Col(Ent):
     def __init__(self, name, voc=None, max_length=None, padding=None, vocab=None):
-        self.name: str = name
+        super().__init__(name=name)
         self.voc: Union[Voc, str] = voc or vocab
         self.max_length = max_length
         self.padding = padding
@@ -25,9 +31,10 @@ class Col:
         return info
-class Voc:
+class Voc(Ent):
     def __init__(self, name, size, cols, store_dir, vocab=None):
-        self.name: str = name
+        super().__init__(name=name)
         self.size: int = size
         self.cols: List[Union[Col, str]] = cols
         self.store_dir = store_dir

{UniTok-3.4.8 → UniTok-3.4.9}/UniTok/unidep.py RENAMED Viewed

@@ -3,7 +3,7 @@ import os
 import random
 import warnings
 from collections import OrderedDict
-from typing import Dict, List, Callable, Union, Optional
+from typing import Dict, List, Callable, Union, Optional, cast
 import numpy as np
 import tqdm
@@ -16,11 +16,20 @@ from .vocabs import Vocabs
 class UniDep:
     VER = Meta.VER
-    def __init__(self, store_dir, silent=False):
+    def __init__(self, store_dir, verbose=True, silent=None):
+        """
+        Unified Data Depot Initialization
+        :param store_dir: Store directory of the data processed by our UniTok or Fut
+        :param verbose:
+        """
         self.store_dir = os.path.expanduser(store_dir)
         self.meta = Meta(self.store_dir)
-        self.silent = silent
+        if silent is not None:
+            warnings.warn('unidep.silent is deprecated, '
+                          'use verbose instead (will be removed in 4.x version)', DeprecationWarning)
+            verbose = not silent
+        self.verbose = verbose
         self.cached = False
         self.cached_samples = []
@@ -28,8 +37,8 @@ class UniDep:
         self.data_path = os.path.join(self.store_dir, 'data.npy')
         self.data = np.load(self.data_path, allow_pickle=True)
         try:
-            # noinspection PyTypeChecker
-            self.data: dict = self.data.item()
+            self.data = self.data.item()
+            self.data = cast(dict, self.data)
         except Exception as err:
             print(err)
             return
@@ -53,7 +62,7 @@ class UniDep:
             self.vocabs.append(Vocab(name=vocab_name).load(self.store_dir))
         for voc in self.vocs:
             self.vocs[voc].vocab = self.vocabs[voc]
-        self.id2index = self.vocabs[self.id_voc.name].o2i
+        self.id2index = self.id_voc.vocab.o2i
         self.unions = OrderedDict()  # type: Dict[str, List[UniDep]]
         self._deep_union = False
@@ -74,7 +83,7 @@ class UniDep:
         silent-aware printer
         """
-        if self.silent:
+        if self.verbose:
             return
         print(*args, **kwargs)
@@ -111,7 +120,7 @@ class UniDep:
         self.cached = False
         self.cached_samples = [None] * self._sample_size
-        for sample in tqdm.tqdm(self, disable=self.silent):
+        for sample in tqdm.tqdm(self, disable=self.verbose):
             self.cached_samples[sample[self.id_col]] = sample
         self.cached = True
@@ -139,7 +148,7 @@ class UniDep:
         """
         introduction = f"""
         UniDep ({self.meta.parse_version(self.meta.version)}): {self.store_dir}
         Sample Size: {self.sample_size}
         Id Column: {self.id_col}
         Columns:\n"""
@@ -187,18 +196,26 @@ class UniDep:
             raise ValueError('deep_union can not be changed after union-ed')
         self._deep_union = value
-    def union(self, *depots: 'UniDep'):
+    def union(self, *depots: 'UniDep', union_col: str = None):
         """
         union depots, where id columns in other depots must exist in current main depot
         """
+        if union_col and union_col not in self.cols:
+            raise ValueError(f'current depot has no column named {union_col}')
         for depot in depots:
             # check if id col exists in current depot
-            if depot.id_col not in self.cols:
-                raise ValueError('current depot has no column named {}'.format(depot.id_col))
-            if depot.id_col not in self.unions:
-                self.unions[depot.id_col] = []
-            self.unions[depot.id_col].append(depot)
+            if not union_col:
+                assert depot.id_col in self.cols, (
+                    ValueError(f'current depot has no column named {union_col}'))
+            else:
+                assert self.cols[union_col].voc == depot.cols[depot.id_col].voc, (
+                    ValueError(f'the vocabs of union col {union_col} and target id col {depot.id_col} are not matched'))
+            current_union_col = union_col or depot.id_col
+            if current_union_col not in self.unions:
+                self.unions[current_union_col] = []
+            self.unions[current_union_col].append(depot)
             self.cols = self._merge_cols(self.cols, depot.cols)
             self.vocs = self._merge_vocs(self.vocs, depot.vocs)
@@ -210,7 +227,7 @@ class UniDep:
             columns = {col_name: [] for col_name in depot.cols}
-            for index in self.data[depot.id_col]:
+            for index in self.data[current_union_col]:
                 for col_name in columns:
                     columns[col_name].append(depot.data[col_name][index])
@@ -295,7 +312,7 @@ class UniDep:
         """
         visible_indexes = []
-        for sample in tqdm.tqdm(self, disable=self.silent):
+        for sample in tqdm.tqdm(self, disable=self.verbose):
             target = sample if col is None else sample[col]
             if filter_func(target):
                 visible_indexes.append(sample[self.id_col])
@@ -333,7 +350,7 @@ class UniDep:
         for voc in self.vocabs:
             self.vocabs[voc].save(store_dir)
-        for sample in tqdm.tqdm(self, disable=self.silent):
+        for sample in tqdm.tqdm(self, disable=self.verbose):
             for col_name in sample:
                 if col_name not in data:
                     data[col_name] = []
@@ -341,7 +358,7 @@ class UniDep:
         for col_name in data:
             data[col_name] = np.array(data[col_name])
-        np.save(os.path.join(store_dir, 'data.npy'), data, allow_pickle=True)
+        np.save(os.path.join(store_dir, 'data.npy'), cast(data, np.ndarray), allow_pickle=True)
         meta_data = self.meta.get_info()
         json.dump(meta_data, open(os.path.join(store_dir, 'meta.data.json'), 'w'), indent=2)
@@ -373,7 +390,7 @@ class UniDep:
     @staticmethod
     def _get_max_length(values):
-        if isinstance(values[0], list):
+        if isinstance(values[0], list) or isinstance(values[0], np.ndarray):
             return max([len(value) for value in values])
         return None

{UniTok-3.4.8 → UniTok-3.4.9}/UniTok.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: UniTok
-Version: 3.4.8
+Version: 3.4.9
 Summary: Unified Tokenizer
 Home-page: https://github.com/Jyonn/UnifiedTokenizer
 Author: Jyonn Liu
@@ -10,18 +10,25 @@ Keywords: token,tokenizer
 Platform: any
 Description-Content-Type: text/markdown
-# UniTok V3
+# UniTok V3: 类SQL数据预处理工具包
+Updated on 2023.11.04
 ## 1. 简介
-UniTok 是一个强大的文本预处理工具包，它提供了一整套的数据预处理工具。UniTok 主要包括两大部分：`UniTok` 和 `UniDep`。`UniTok` 负责统一处理数据，它包括分词器（Tokenizers），数据列（Columns）等组件。`UniDep` 负责数据依赖的处理，包括词汇表（Vocabs），元数据（Meta）等。
+UniTok 是史上第一个类SQL的数据预处理工具包，提供了一整套的数据封装和编辑工具。
+UniTok 主要包括两大组件：负责统一数据处理的`UniTok` 和 负责数据读取和二次编辑的`UniDep`：
+- `UniTok` 通过分词器（Tokenizers）和数据列（Columns）等组件将生数据（Raw Data）进行分词与ID化操作，并最终以numpy数组格式存储为一张数据表。
+- `UniDep` 读取由`UniTok`生成的数据表以及元数据（如词表信息），可以直接与Pytorch的Dataset结合使用，也可以完成二次编辑、和其他数据表合并、导出等操作。
+- 在3.1.9版本后，我们推出`Fut` 组件，它是`UniTok`的替代品，可以更快速地完成数据预处理。
 ## 2. 安装
 使用pip安装：
 ```bash
-pip install unitok>=3.0.11
+pip install unitok>=3.4.8
 ```
 ## 3. 主要功能

{UniTok-3.4.8 → UniTok-3.4.9}/setup.py RENAMED Viewed

@@ -6,7 +6,7 @@ long_description = (this_directory / "README.md").read_text(encoding='utf8')
 setup(
     name='UniTok',
-    version='3.4.8',
+    version='3.4.9',
     keywords=['token', 'tokenizer'],
     description='Unified Tokenizer',
     long_description=long_description,