UniTok 3.0.6__tar.gz → 3.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {UniTok-3.0.6 → UniTok-3.0.8}/PKG-INFO +1 -1
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/meta.py +7 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/unidep.py +34 -9
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/unitok.py +37 -33
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/PKG-INFO +1 -1
- {UniTok-3.0.6 → UniTok-3.0.8}/setup.py +1 -1
- {UniTok-3.0.6 → UniTok-3.0.8}/README.md +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/__init__.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/analysis/__init__.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/analysis/lengths.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/analysis/plot.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/cols.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/column.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/global_setting.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/__init__.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/bert_tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/entity_tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/id_tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/number_tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/seq_tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/split_tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/tok/tok.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/vocab.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok/vocabs.py +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/SOURCES.txt +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/dependency_links.txt +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/requires.txt +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/UniTok.egg-info/top_level.txt +0 -0
- {UniTok-3.0.6 → UniTok-3.0.8}/setup.cfg +0 -0
@@ -1,3 +1,4 @@
|
|
1
|
+
import json
|
1
2
|
import os
|
2
3
|
import random
|
3
4
|
import warnings
|
@@ -120,18 +121,21 @@ class UniDep:
|
|
120
121
|
|
121
122
|
@classmethod
|
122
123
|
def _merge_cols(cls, c1: Dict[str, Col], c2: Dict[str, Col]) -> Dict[str, Col]:
|
123
|
-
for
|
124
|
-
if
|
125
|
-
raise ValueError(f'col {
|
124
|
+
for name, col in c2.items():
|
125
|
+
if name in c1 and c1[name] != col:
|
126
|
+
raise ValueError(f'col {name} config conflict')
|
126
127
|
return cls._merge(c1, c2)
|
127
128
|
|
128
129
|
@classmethod
|
129
130
|
def _merge_vocs(cls, v1: Dict[str, Voc], v2: Dict[str, Voc]) -> Dict[str, Voc]:
|
130
|
-
|
131
|
-
|
132
|
-
if
|
133
|
-
|
134
|
-
|
131
|
+
merged = v1.copy()
|
132
|
+
for name, vocab in v2.items():
|
133
|
+
if name in v1:
|
134
|
+
if v1[name] != vocab:
|
135
|
+
raise ValueError(f'vocab {name} config conflict')
|
136
|
+
vocab = v1[name].merge(vocab)
|
137
|
+
merged[name] = vocab
|
138
|
+
return merged
|
135
139
|
|
136
140
|
def union(self, *depots: 'UniDep'):
|
137
141
|
"""
|
@@ -168,6 +172,27 @@ class UniDep:
|
|
168
172
|
self.sample_size = len(self._indexes)
|
169
173
|
return self
|
170
174
|
|
175
|
+
def export(self, store_dir):
|
176
|
+
"""
|
177
|
+
export unioned or filtered depot
|
178
|
+
"""
|
179
|
+
|
180
|
+
os.makedirs(store_dir, exist_ok=True)
|
181
|
+
data = dict()
|
182
|
+
|
183
|
+
for sample in tqdm.tqdm(self, disable=self.silent):
|
184
|
+
for col_name in sample:
|
185
|
+
if col_name not in data:
|
186
|
+
data[col_name] = []
|
187
|
+
data[col_name].append(sample[col_name])
|
188
|
+
|
189
|
+
for col_name in data:
|
190
|
+
data[col_name] = np.array(data[col_name])
|
191
|
+
np.save(os.path.join(store_dir, 'data.npy'), data, allow_pickle=True)
|
192
|
+
|
193
|
+
meta_data = self.meta.get_info()
|
194
|
+
json.dump(meta_data, open(os.path.join(store_dir, 'meta.data.json'), 'w'), indent=2)
|
195
|
+
|
171
196
|
"""
|
172
197
|
Deprecated properties and methods
|
173
198
|
"""
|
@@ -192,7 +217,7 @@ class UniDep:
|
|
192
217
|
|
193
218
|
def get_vocab_size(self, col_name, as_vocab=False):
|
194
219
|
warnings.warn('unidep.get_vocab_size is deprecated (will be removed in 4.x version)', DeprecationWarning)
|
195
|
-
vocab_id = col_name if as_vocab else self.
|
220
|
+
vocab_id = col_name if as_vocab else self.cols[col_name].voc.name
|
196
221
|
return self.vocs[vocab_id].size
|
197
222
|
|
198
223
|
def get_vocab(self, col_name):
|
@@ -18,6 +18,43 @@ from .vocabs import Vocabs
|
|
18
18
|
class UniTok:
|
19
19
|
"""
|
20
20
|
Unified Tokenizer, which can be used to tokenize different types of data in a DataFrame.
|
21
|
+
|
22
|
+
Example:
|
23
|
+
>>> import pandas as pd
|
24
|
+
>>> from UniTok import UniTok, Column, Vocab
|
25
|
+
>>>
|
26
|
+
>>> # load data
|
27
|
+
>>> df = pd.read_csv(
|
28
|
+
... filepath_or_buffer='news-sample.tsv',
|
29
|
+
... sep='\t',
|
30
|
+
... names=['nid', 'cat', 'subCat', 'title', 'abs', 'url', 'titEnt', 'absEnt'],
|
31
|
+
... usecols=['nid', 'cat', 'subCat', 'title', 'abs'],
|
32
|
+
... )
|
33
|
+
>>>
|
34
|
+
>>> # define tokenizers
|
35
|
+
>>> id_tok = IdTok(name='nid')
|
36
|
+
>>> cat_tok = EntTok(name='cat')
|
37
|
+
>>> text_tok = BertTok(name='eng', vocab_dir='bert-base-uncased')
|
38
|
+
>>>
|
39
|
+
>>> # define UniTok
|
40
|
+
>>> tok = UniTok().add_index_col(name='nid').add_col(Column(
|
41
|
+
... name='cat',
|
42
|
+
... tok=cat_tok,
|
43
|
+
... )).add_col(Column(
|
44
|
+
... name='subCat',
|
45
|
+
... tok=cat_tok,
|
46
|
+
... )).add_col(Column(
|
47
|
+
... name='title',
|
48
|
+
... tok=text_tok,
|
49
|
+
... max_length=20,
|
50
|
+
... )).add_col(Column(
|
51
|
+
... name='abs',
|
52
|
+
... tok=text_tok,
|
53
|
+
... max_length=30,
|
54
|
+
... ))
|
55
|
+
>>>
|
56
|
+
>>> # tokenize
|
57
|
+
>>> tok.read_file(df).tokenize().store_data('news-sample')
|
21
58
|
"""
|
22
59
|
VER = 'v3.0'
|
23
60
|
|
@@ -151,36 +188,3 @@ class UniTok:
|
|
151
188
|
)
|
152
189
|
json.dump(meta_data, open(os.path.join(store_dir, 'meta.data.json'), 'w'), indent=2)
|
153
190
|
return self
|
154
|
-
|
155
|
-
|
156
|
-
if __name__ == '__main__':
|
157
|
-
df = pd.read_csv(
|
158
|
-
filepath_or_buffer='news-sample.tsv',
|
159
|
-
sep='\t',
|
160
|
-
names=['nid', 'cat', 'subCat', 'title', 'abs', 'url', 'titEnt', 'absEnt'],
|
161
|
-
usecols=['nid', 'cat', 'subCat', 'title', 'abs'],
|
162
|
-
)
|
163
|
-
|
164
|
-
ut = UniTok()
|
165
|
-
id_tok = IdTok(name='news')
|
166
|
-
cat_tok = EntTok(name='cat')
|
167
|
-
txt_tok = BertTok(name='english', vocab_dir='bert-base-uncased')
|
168
|
-
cat_tok.vocab.reserve(100)
|
169
|
-
|
170
|
-
ut.add_col(Column(
|
171
|
-
name='nid',
|
172
|
-
tok=id_tok,
|
173
|
-
)).add_col(Column(
|
174
|
-
name='cat',
|
175
|
-
tok=cat_tok,
|
176
|
-
)).add_col(Column(
|
177
|
-
name='subCat',
|
178
|
-
tok=cat_tok,
|
179
|
-
)).add_col(Column(
|
180
|
-
name='title',
|
181
|
-
tok=txt_tok,
|
182
|
-
)).add_col(Column(
|
183
|
-
name='abs',
|
184
|
-
tok=txt_tok,
|
185
|
-
)).read_file(df).tokenize()
|
186
|
-
ut.store_data('news-sample')
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|