UniTok 4.0.3.tar.gz → 4.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. {UniTok-4.0.3 → UniTok-4.3.0}/PKG-INFO +14 -13
  2. {UniTok-4.0.3 → UniTok-4.3.0}/README.md +12 -11
  3. {UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/PKG-INFO +14 -13
  4. {UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/SOURCES.txt +1 -1
  5. {UniTok-4.0.3 → UniTok-4.3.0}/setup.py +2 -2
  6. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/__init__.py +2 -0
  7. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/__main__.py +36 -6
  8. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/job.py +17 -2
  9. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/meta.py +2 -1
  10. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/__init__.py +3 -3
  11. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/base_tokenizer.py +7 -1
  12. UniTok-4.3.0/unitok/tokenizer/glove_tokenizer.py +21 -0
  13. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/transformers_tokenizer.py +18 -8
  14. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/unitok.py +30 -2
  15. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/handler/pkl_handler.py +1 -1
  16. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/hub/hub.py +1 -1
  17. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/vocabulary/vocabulary.py +25 -9
  18. UniTok-4.0.3/unitok/tokenizer/cachable_tokenizer.py +0 -25
  19. {UniTok-4.0.3 → UniTok-4.3.0}/LICENSE +0 -0
  20. {UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/dependency_links.txt +0 -0
  21. {UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/entry_points.txt +0 -0
  22. {UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/requires.txt +0 -0
  23. {UniTok-4.0.3 → UniTok-4.3.0}/UniTok.egg-info/top_level.txt +0 -0
  24. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/__init__.py +0 -0
  25. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/__main__.py +0 -0
  26. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/analysis/__init__.py +0 -0
  27. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/analysis/lengths.py +0 -0
  28. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/analysis/plot.py +0 -0
  29. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/cols.py +0 -0
  30. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/column.py +0 -0
  31. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/fut.py +0 -0
  32. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/global_setting.py +0 -0
  33. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/meta.py +0 -0
  34. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/__init__.py +0 -0
  35. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/bert_tok.py +0 -0
  36. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/ent_tok.py +0 -0
  37. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/id_tok.py +0 -0
  38. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/number_tok.py +0 -0
  39. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/seq_tok.py +0 -0
  40. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/split_tok.py +0 -0
  41. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/tok/tok.py +0 -0
  42. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/unidep.py +0 -0
  43. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/unitok.py +0 -0
  44. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/vocab.py +0 -0
  45. {UniTok-4.0.3 → UniTok-4.3.0}/UniTokv3/vocabs.py +0 -0
  46. {UniTok-4.0.3 → UniTok-4.3.0}/setup.cfg +0 -0
  47. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/selector.py +0 -0
  48. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/status.py +0 -0
  49. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/digit_tokenizer.py +0 -0
  50. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/entity_tokenizer.py +0 -0
  51. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/split_tokenizer.py +0 -0
  52. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/union_tokenizer.py +0 -0
  53. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/tokenizer/unknown_tokenizer.py +0 -0
  54. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/__init__.py +0 -0
  55. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/class_pool.py +0 -0
  56. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/data.py +0 -0
  57. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/function.py +0 -0
  58. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/handler/__init__.py +0 -0
  59. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/handler/json_handler.py +0 -0
  60. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/hub/__init__.py +0 -0
  61. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/hub/param_hub.py +0 -0
  62. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/index_set/__init__.py +0 -0
  63. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/index_set/index_set.py +0 -0
  64. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/index_set/job_set.py +0 -0
  65. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/index_set/tokenizer_set.py +0 -0
  66. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/index_set/vocabulary_set.py +0 -0
  67. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/instance.py +0 -0
  68. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/map.py +0 -0
  69. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/space.py +0 -0
  70. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/symbol.py +0 -0
  71. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/utils/verbose.py +0 -0
  72. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/vocabulary/__init__.py +0 -0
  73. {UniTok-4.0.3 → UniTok-4.3.0}/unitok/vocabulary/counter.py +0 -0
--- UniTok-4.0.3/PKG-INFO
+++ UniTok-4.3.0/PKG-INFO
@@ -1,12 +1,12 @@
 Metadata-Version: 2.1
 Name: UniTok
-Version: 4.0.3
+Version: 4.3.0
 Summary: Unified Tokenizer
 Home-page: https://github.com/Jyonn/UnifiedTokenizer
 Author: Jyonn Liu
 Author-email: liu@qijiong.work
 License: MIT Licence
-Keywords: token,tokenizer
+Keywords: token,tokenizer,NLP,transformers,glove,bert,llama
 Platform: any
 Description-Content-Type: text/markdown
 License-File: LICENSE
@@ -29,17 +29,18 @@ UniTok is designed to simplify preprocessing by offering reusable components suc
 
 ### Changes and Comparisons
 
-| Feature           | UniTok v3                                                    | UniTok v4                                            | Comments                                                                       |
-|-------------------|--------------------------------------------------------------|------------------------------------------------------|--------------------------------------------------------------------------------|
-| `UniTok` class    | Solely for tokenization                                      | Manages the entire preprocessing lifecycle           |                                                                                |
-| `UniDep` class    | Data loading and combining                                   | Removed                                              | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
-| `Column` class    | Column name is for both the original and tokenized datasets | N/A                                                  | V4 introduces a `Job` class.                                                   |
-| `Job` class       | N/A                                                          | Defines how a specific column should be tokenized    |                                                                                |
-| `Tokenizer` class | Ambiguous return type definition                             | `return_list` parameter must be of type `bool`       |                                                                                |
-| `Tokenizer` class | Only supports `BertTokenizer` for text processing            | Supports all Tokenizers in the transformers library  | New `TransformersTokenizer` class                                              |
-| `analyse` method  | Supported                                                    | Not supported Currently                              |                                                                                |
-| `Meta` class      | Only for human-friendly displaying                           | Manager for `Job`, `Tokenizer`, and `Vocab`          |                                                                                |
-| `unitok` command  | Visualization in the terminal                                | More colorful and detailed output                    |                                                                                |
+| Feature                         | UniTok v3                                                    | UniTok v4                                            | Comments                                                                       |
+|---------------------------------|--------------------------------------------------------------|------------------------------------------------------|--------------------------------------------------------------------------------|
+| `UniTok` class                  | Solely for tokenization                                      | Manages the entire preprocessing lifecycle           |                                                                                |
+| `UniDep` class                  | Data loading and combining                                   | Removed                                              | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
+| `Column` class                  | Column name is for both the original and tokenized datasets | N/A                                                  | V4 introduces a `Job` class.                                                   |
+| `Job` class                     | N/A                                                          | Defines how a specific column should be tokenized    |                                                                                |
+| `Tokenizer` class               | Ambiguous return type definition                             | `return_list` parameter must be of type `bool`       |                                                                                |
+| `Tokenizer` class               | Only supports `BertTokenizer` for text processing            | Supports all Tokenizers in the transformers library  | New `TransformersTokenizer` class                                              |
+| `analyse` method                | Supported                                                    | Not supported Currently                              |                                                                                |
+| `Meta` class                    | Only for human-friendly displaying                           | Manager for `Job`, `Tokenizer`, and `Vocab`          |                                                                                |
+| `unitok` command                | Visualization in the terminal                                | More colorful and detailed output                    |                                                                                |
+| `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files                    | Save and load vocabulary using pickle files          | Avoids issues with special characters in text files                           |
 
 ### How to Migrate the Processed Data
 
--- UniTok-4.0.3/README.md
+++ UniTok-4.3.0/README.md
@@ -16,17 +16,18 @@ UniTok is designed to simplify preprocessing by offering reusable components suc
 
 ### Changes and Comparisons
 
-| Feature           | UniTok v3                                                    | UniTok v4                                            | Comments                                                                       |
-|-------------------|--------------------------------------------------------------|------------------------------------------------------|--------------------------------------------------------------------------------|
-| `UniTok` class    | Solely for tokenization                                      | Manages the entire preprocessing lifecycle           |                                                                                |
-| `UniDep` class    | Data loading and combining                                   | Removed                                              | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
-| `Column` class    | Column name is for both the original and tokenized datasets | N/A                                                  | V4 introduces a `Job` class.                                                   |
-| `Job` class       | N/A                                                          | Defines how a specific column should be tokenized    |                                                                                |
-| `Tokenizer` class | Ambiguous return type definition                             | `return_list` parameter must be of type `bool`       |                                                                                |
-| `Tokenizer` class | Only supports `BertTokenizer` for text processing            | Supports all Tokenizers in the transformers library  | New `TransformersTokenizer` class                                              |
-| `analyse` method  | Supported                                                    | Not supported Currently                              |                                                                                |
-| `Meta` class      | Only for human-friendly displaying                           | Manager for `Job`, `Tokenizer`, and `Vocab`          |                                                                                |
-| `unitok` command  | Visualization in the terminal                                | More colorful and detailed output                    |                                                                                |
+| Feature                         | UniTok v3                                                    | UniTok v4                                            | Comments                                                                       |
+|---------------------------------|--------------------------------------------------------------|------------------------------------------------------|--------------------------------------------------------------------------------|
+| `UniTok` class                  | Solely for tokenization                                      | Manages the entire preprocessing lifecycle           |                                                                                |
+| `UniDep` class                  | Data loading and combining                                   | Removed                                              | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
+| `Column` class                  | Column name is for both the original and tokenized datasets | N/A                                                  | V4 introduces a `Job` class.                                                   |
+| `Job` class                     | N/A                                                          | Defines how a specific column should be tokenized    |                                                                                |
+| `Tokenizer` class               | Ambiguous return type definition                             | `return_list` parameter must be of type `bool`       |                                                                                |
+| `Tokenizer` class               | Only supports `BertTokenizer` for text processing            | Supports all Tokenizers in the transformers library  | New `TransformersTokenizer` class                                              |
+| `analyse` method                | Supported                                                    | Not supported Currently                              |                                                                                |
+| `Meta` class                    | Only for human-friendly displaying                           | Manager for `Job`, `Tokenizer`, and `Vocab`          |                                                                                |
+| `unitok` command                | Visualization in the terminal                                | More colorful and detailed output                    |                                                                                |
+| `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files                    | Save and load vocabulary using pickle files          | Avoids issues with special characters in text files                           |
 
 ### How to Migrate the Processed Data
 
--- UniTok-4.0.3/UniTok.egg-info/PKG-INFO
+++ UniTok-4.3.0/UniTok.egg-info/PKG-INFO
@@ -1,12 +1,12 @@
 Metadata-Version: 2.1
 Name: UniTok
-Version: 4.0.3
+Version: 4.3.0
 Summary: Unified Tokenizer
 Home-page: https://github.com/Jyonn/UnifiedTokenizer
 Author: Jyonn Liu
 Author-email: liu@qijiong.work
 License: MIT Licence
-Keywords: token,tokenizer
+Keywords: token,tokenizer,NLP,transformers,glove,bert,llama
 Platform: any
 Description-Content-Type: text/markdown
 License-File: LICENSE
@@ -29,17 +29,18 @@ UniTok is designed to simplify preprocessing by offering reusable components suc
 
 ### Changes and Comparisons
 
-| Feature           | UniTok v3                                                    | UniTok v4                                            | Comments                                                                       |
-|-------------------|--------------------------------------------------------------|------------------------------------------------------|--------------------------------------------------------------------------------|
-| `UniTok` class    | Solely for tokenization                                      | Manages the entire preprocessing lifecycle           |                                                                                |
-| `UniDep` class    | Data loading and combining                                   | Removed                                              | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
-| `Column` class    | Column name is for both the original and tokenized datasets | N/A                                                  | V4 introduces a `Job` class.                                                   |
-| `Job` class       | N/A                                                          | Defines how a specific column should be tokenized    |                                                                                |
-| `Tokenizer` class | Ambiguous return type definition                             | `return_list` parameter must be of type `bool`       |                                                                                |
-| `Tokenizer` class | Only supports `BertTokenizer` for text processing            | Supports all Tokenizers in the transformers library  | New `TransformersTokenizer` class                                              |
-| `analyse` method  | Supported                                                    | Not supported Currently                              |                                                                                |
-| `Meta` class      | Only for human-friendly displaying                           | Manager for `Job`, `Tokenizer`, and `Vocab`          |                                                                                |
-| `unitok` command  | Visualization in the terminal                                | More colorful and detailed output                    |                                                                                |
+| Feature                         | UniTok v3                                                    | UniTok v4                                            | Comments                                                                       |
+|---------------------------------|--------------------------------------------------------------|------------------------------------------------------|--------------------------------------------------------------------------------|
+| `UniTok` class                  | Solely for tokenization                                      | Manages the entire preprocessing lifecycle           |                                                                                |
+| `UniDep` class                  | Data loading and combining                                   | Removed                                              | V4 combines the functionalities of `UniTok` and `UniDep` into a single class. |
+| `Column` class                  | Column name is for both the original and tokenized datasets | N/A                                                  | V4 introduces a `Job` class.                                                   |
+| `Job` class                     | N/A                                                          | Defines how a specific column should be tokenized    |                                                                                |
+| `Tokenizer` class               | Ambiguous return type definition                             | `return_list` parameter must be of type `bool`       |                                                                                |
+| `Tokenizer` class               | Only supports `BertTokenizer` for text processing            | Supports all Tokenizers in the transformers library  | New `TransformersTokenizer` class                                              |
+| `analyse` method                | Supported                                                    | Not supported Currently                              |                                                                                |
+| `Meta` class                    | Only for human-friendly displaying                           | Manager for `Job`, `Tokenizer`, and `Vocab`          |                                                                                |
+| `unitok` command                | Visualization in the terminal                                | More colorful and detailed output                    |                                                                                |
+| `Vocab` class (unitok >= 4.1.0) | Save and load vocabulary using text files                    | Save and load vocabulary using pickle files          | Avoids issues with special characters in text files                           |
 
 ### How to Migrate the Processed Data
 
--- UniTok-4.0.3/UniTok.egg-info/SOURCES.txt
+++ UniTok-4.3.0/UniTok.egg-info/SOURCES.txt
@@ -38,9 +38,9 @@ unitok/status.py
 unitok/unitok.py
 unitok/tokenizer/__init__.py
 unitok/tokenizer/base_tokenizer.py
-unitok/tokenizer/cachable_tokenizer.py
 unitok/tokenizer/digit_tokenizer.py
 unitok/tokenizer/entity_tokenizer.py
+unitok/tokenizer/glove_tokenizer.py
 unitok/tokenizer/split_tokenizer.py
 unitok/tokenizer/transformers_tokenizer.py
 unitok/tokenizer/union_tokenizer.py
--- UniTok-4.0.3/setup.py
+++ UniTok-4.3.0/setup.py
@@ -6,8 +6,8 @@ long_description = (this_directory / "README.md").read_text(encoding='utf8')
 
 setup(
     name='UniTok',
-    version='4.0.3',
-    keywords=['token', 'tokenizer'],
+    version='4.3.0',
+    keywords=['token', 'tokenizer', 'NLP', 'transformers', 'glove', 'bert', 'llama'],
     description='Unified Tokenizer',
     long_description=long_description,
     long_description_content_type='text/markdown',
--- UniTok-4.0.3/unitok/__init__.py
+++ UniTok-4.3.0/unitok/__init__.py
@@ -9,6 +9,7 @@ from unitok.tokenizer import BaseTokenizer, TokenizerHub
 from unitok.tokenizer import EntityTokenizer, EntitiesTokenizer
 from unitok.tokenizer import TransformersTokenizer, BertTokenizer
 from unitok.tokenizer import SplitTokenizer, DigitTokenizer, DigitsTokenizer
+from unitok.tokenizer import GloVeTokenizer
 from unitok.job import Job, JobHub
 
 from unitok.utils.index_set import IndexSet, VocabSet, TokenizerSet, JobSet
@@ -29,6 +30,7 @@ __all__ = [
     'EntityTokenizer', 'EntitiesTokenizer',
     'TransformersTokenizer', 'BertTokenizer',
     'SplitTokenizer', 'DigitTokenizer', 'DigitsTokenizer',
+    'GloVeTokenizer',
     'Job', 'JobHub',
     'IndexSet', 'VocabSet', 'TokenizerSet', 'JobSet',
     'Meta',
--- UniTok-4.0.3/unitok/__main__.py
+++ UniTok-4.3.0/unitok/__main__.py
@@ -1,7 +1,9 @@
 import argparse
 
 import pandas as pd
+from pigmento import pnt
 
+from unitok import Vocab
 from unitok.tokenizer import BaseTokenizer
 from unitok.unitok import UniTok
 from unitok.utils.class_pool import ClassPool
@@ -29,7 +31,7 @@ def integrate():
         if arg.startswith('--t.'):
             current_param = arg[4:]
         elif arg.startswith('--tokenizer.'):
-            current_param = arg[11:]
+            current_param = arg[12:]
 
     if args.file.endswith('.csv') or args.file.endswith('.tsv'):
         df = pd.read_csv(args.file, sep='\t')
@@ -39,17 +41,32 @@ def integrate():
         raise ValueError(f'Unsupported file format: {args.file}')
 
     with UniTok.load(args.path, tokenizer_lib=args.lib) as ut:
+        tokenizer = None
+
         if args.tokenizer_id:
             for t in ut.meta.tokenizers:  # type: BaseTokenizer
                 if t.get_tokenizer_id() == args.tokenizer_id:
                     tokenizer = t
                     break
            else:
-                raise ValueError(f'Unknown tokenizer id: {args.tokenizer_id}')
-        else:
-            assert args.tokenizer is not None and args.vocab is not None, 'Tokenizer classname and vocabulary must be specified'
+                pnt(f'Unknown tokenizer id: {args.tokenizer_id}, will create a new tokenizer')
+                tokenizer_params['tokenizer_id'] = args.tokenizer_id
+
+        if not tokenizer:
+            if args.tokenizer is None and args.vocab is None:
+                raise ValueError('Tokenizer classname and vocabulary must be specified')
+
+            if args.vocab.endswith('.vocab'):
+                if '/' in args.vocab:
+                    vocab_path, vocab_name = args.vocab.rsplit('/', 1)
+                else:
+                    vocab_path, vocab_name = '.', args.vocab
+                vocab_name = vocab_name[:-6]
+                args.vocab = Vocab(vocab_name).load(vocab_path)
+
             tokenizers = ClassPool.tokenizers(args.lib)
-            assert args.tokenizer in tokenizers, f'Unknown tokenizer: {args.tokenizer}. Available tokenizers: {tokenizers.keys()}'
+            if args.tokenizer not in tokenizers:
+                raise ValueError(f'Unknown tokenizer: {args.tokenizer}. Available tokenizers: {tokenizers.keys()}')
            tokenizer = tokenizers[args.tokenizer](vocab=args.vocab, **tokenizer_params)
 
         ut.add_job(tokenizer=tokenizer, column=args.column, name=args.name, truncate=args.truncate)
@@ -65,14 +82,27 @@ def summarize():
         ut.summarize()
 
 
+def remove():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path', type=str, default='.', help='path to a unitok data directory')
+    parser.add_argument('--name', type=str, help='job name to remove')
+    args, _ = parser.parse_known_args()
+
+    with UniTok.load(args.path) as ut:
+        ut.remove_job(args.name)
+        ut.save(args.path)
+
+
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--action', '-a', type=str, default='summarize', choices=['summarize', 'integrate'])
+    parser.add_argument('--action', '-a', type=str, default='summarize', choices=['summarize', 'integrate', 'remove'])
 
     args, _ = parser.parse_known_args()
     action = args.action
 
     if action == 'integrate':
         integrate()
+    elif action == 'remove':
+        remove()
     else:
         summarize()
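
The new `remove` action added above wraps `UniTok.remove_job` and re-saves the data directory; the `unitok` console command now accepts `--action remove` together with `--name`. A minimal Python sketch of the equivalent call, where the directory `./data` and the job name `title` are illustrative:

```python
from unitok.unitok import UniTok

# Roughly what `unitok ./data --action remove --name title` does:
with UniTok.load('./data') as ut:   # './data' is an illustrative UniTok directory
    ut.remove_job('title')          # 'title' is an illustrative job name
    ut.save('./data')               # persist the directory without the removed job
```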
--- UniTok-4.0.3/unitok/job.py
+++ UniTok-4.3.0/unitok/job.py
@@ -1,3 +1,6 @@
+from typing import Union
+
+from unitok import TokenizerHub, VocabHub
 from unitok.tokenizer.union_tokenizer import UnionTokenizer
 
 from unitok.tokenizer import BaseTokenizer
@@ -8,7 +11,7 @@ from unitok.utils.hub import Hub
 class Job:
     def __init__(
             self,
-            tokenizer: BaseTokenizer,
+            tokenizer: Union[BaseTokenizer, str],
             column: str,
             name: str = None,
             truncate: int = None,
@@ -16,7 +19,13 @@ class Job:
             key: bool = False,
             max_len: int = 0,
     ):
+        if isinstance(tokenizer, str):
+            if TokenizerHub.has(tokenizer):
+                tokenizer = TokenizerHub.get(tokenizer)
+            else:
+                raise ValueError(f"Tokenizer {tokenizer} not found in the tokenizer hub.")
         self.tokenizer: BaseTokenizer = tokenizer
+
         self.column: str = column
         self.name: str = name
         self.truncate: int = truncate
@@ -26,7 +35,8 @@ class Job:
         self.max_len = max_len
         self.from_union = isinstance(self.tokenizer, UnionTokenizer)
 
-        JobHub.add(self.name, self)
+        JobHub.add(self)
+        VocabHub.add(self.tokenizer.vocab)
 
     @property
     def return_list(self):
@@ -77,3 +87,8 @@ class Job:
 
 class JobHub(Hub[Job]):
     _instance = Instance(compulsory_space=True)
+
+    @classmethod
+    def add(cls, key, obj: Job = None):
+        key, obj = key.name, key
+        return super().add(key, obj)
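
Because `Job` now resolves a plain string through `TokenizerHub`, a job can refer to a previously registered tokenizer by its id instead of holding the object. A hedged sketch (directory, id, and column names are illustrative, and it assumes `UniTok.add_job` forwards its `tokenizer` argument to `Job` as this diff suggests):

```python
from unitok.unitok import UniTok
from unitok.tokenizer import BertTokenizer

with UniTok.load('./data') as ut:                      # illustrative directory
    BertTokenizer(vocab='bert', tokenizer_id='bert')   # registers itself in TokenizerHub
    # The string 'bert' is looked up via TokenizerHub inside Job.__init__.
    ut.add_job(tokenizer='bert', column='title', name='title@bert', truncate=30)
```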
--- UniTok-4.0.3/unitok/meta.py
+++ UniTok-4.3.0/unitok/meta.py
@@ -15,7 +15,7 @@ from unitok.vocabulary import Vocab, VocabHub
 
 
 class Meta:
-    version = 'unidep-v4beta'
+    version = 'unidep-v4'
 
     def __init__(self):
         self.note = ('Not compatible with unitok-v3 or lower version, '
@@ -116,6 +116,7 @@ class Meta:
         meta.vocabularies = VocabSet({cls.parse_vocabulary(**v).load(save_dir) for v in kwargs.get('vocabularies')})
         meta.tokenizers = TokenizerSet({cls.parse_tokenizer(**t) for t in kwargs.get('tokenizers')})
         meta.jobs = JobSet({cls.parse_job(**j) for j in kwargs.get('jobs')})
+        meta.version = kwargs.get('version')
 
         return meta
 
--- UniTok-4.0.3/unitok/tokenizer/__init__.py
+++ UniTok-4.3.0/unitok/tokenizer/__init__.py
@@ -1,6 +1,6 @@
 from unitok.tokenizer.base_tokenizer import BaseTokenizer, TokenizerHub
-from unitok.tokenizer.cachable_tokenizer import CachableTokenizer
 from unitok.tokenizer.entity_tokenizer import EntityTokenizer, EntitiesTokenizer
+from unitok.tokenizer.glove_tokenizer import GloVeTokenizer
 from unitok.tokenizer.transformers_tokenizer import TransformersTokenizer, BertTokenizer
 from unitok.tokenizer.split_tokenizer import SplitTokenizer
 from unitok.tokenizer.digit_tokenizer import DigitTokenizer, DigitsTokenizer
@@ -8,7 +8,6 @@ from unitok.tokenizer.digit_tokenizer import DigitTokenizer, DigitsTokenizer
 
 __all__ = [
     BaseTokenizer,
-    CachableTokenizer,
     EntityTokenizer,
     EntitiesTokenizer,
     TransformersTokenizer,
@@ -16,5 +15,6 @@ __all__ = [
     SplitTokenizer,
     DigitTokenizer,
     DigitsTokenizer,
-    TokenizerHub
+    GloVeTokenizer,
+    TokenizerHub,
 ]
--- UniTok-4.0.3/unitok/tokenizer/base_tokenizer.py
+++ UniTok-4.3.0/unitok/tokenizer/base_tokenizer.py
@@ -28,7 +28,8 @@ class BaseTokenizer(abc.ABC):
 
         self._tokenizer_id = tokenizer_id
 
-        TokenizerHub.add(self.get_tokenizer_id(), self)
+        TokenizerHub.add(self)
+        VocabHub.add(self.vocab)
 
     def get_tokenizer_id(self):
         if self._tokenizer_id is None:
@@ -81,3 +82,8 @@ class BaseTokenizer(abc.ABC):
 
 class TokenizerHub(Hub[BaseTokenizer]):
     _instance = Instance()
+
+    @classmethod
+    def add(cls, key, obj: BaseTokenizer = None):
+        key, obj = key.get_tokenizer_id(), key
+        return super().add(key, obj)
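
With `TokenizerHub.add` (and the matching `JobHub`/`VocabularyHub` overrides) now deriving the registry key from the object itself, constructing a tokenizer is enough to register both it and its vocabulary. A hedged sketch, assuming `EntityTokenizer` forwards `vocab` and `tokenizer_id` to `BaseTokenizer` like the other tokenizers in this diff, and that a string vocab is materialized under that name:

```python
from unitok.tokenizer import EntityTokenizer, TokenizerHub
from unitok.vocabulary import VocabHub

tok = EntityTokenizer(vocab='category', tokenizer_id='cat')  # illustrative names
assert TokenizerHub.get('cat') is tok    # hub key comes from get_tokenizer_id()
assert VocabHub.has('category')          # the tokenizer's vocab is registered as a side effect
```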
--- /dev/null
+++ UniTok-4.3.0/unitok/tokenizer/glove_tokenizer.py
@@ -0,0 +1,21 @@
+import nltk
+
+from unitok.vocabulary import VocabHub
+from unitok.tokenizer import BaseTokenizer
+
+
+class GloVeTokenizer(BaseTokenizer):
+    return_list = True
+    param_list = ['language']
+
+    def __init__(self, vocab, language='english', **kwargs):
+        if isinstance(vocab, str) and not VocabHub.has(vocab):
+            raise ValueError('GloVeTokenizer requires a pre-filled Vocab object that stores valid tokens')
+
+        super().__init__(vocab=vocab, **kwargs)
+
+        self.language = language
+
+    def __call__(self, obj):
+        objs = nltk.tokenize.word_tokenize(obj.lower(), language=self.language)
+        return [self.vocab[o] for o in objs if o in self.vocab]
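
A hedged usage sketch of the new `GloVeTokenizer`: it splits text with NLTK's `word_tokenize` (so the punkt tokenizer data must be installed) and keeps only tokens already present in a pre-filled vocabulary; out-of-vocabulary words are silently dropped. The vocabulary name and token list below are illustrative:

```python
import nltk
from unitok.vocabulary import Vocab
from unitok.tokenizer import GloVeTokenizer

nltk.download('punkt')                      # newer NLTK releases may also need 'punkt_tab'

glove_vocab = Vocab('glove')                # illustrative name
glove_vocab.extend(['the', 'cat', 'sat'])   # normally filled from the GloVe token list

tok = GloVeTokenizer(vocab=glove_vocab, language='english')
print(tok('The cat sat on the mat.'))       # indices of 'the', 'cat', 'sat', 'the'; other words are dropped
```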
--- UniTok-4.0.3/unitok/tokenizer/transformers_tokenizer.py
+++ UniTok-4.3.0/unitok/tokenizer/transformers_tokenizer.py
@@ -1,31 +1,37 @@
 from typing import Union
 
+from pigmento import pnt
 from transformers import AutoTokenizer
 
-from UniTokv3.vocab import Vocab
-from unitok.tokenizer import CachableTokenizer
+from unitok.vocabulary import Vocab
+from unitok.tokenizer import BaseTokenizer
 
 
-class TransformersTokenizer(CachableTokenizer):
+class TransformersTokenizer(BaseTokenizer):
     return_list = True
-    param_list = ['key']
 
-    def __init__(self, vocab: Union[str, Vocab], tokenizer_id: str = None, use_cache=False, key: str = None, **kwargs):
-        super().__init__(vocab=vocab, tokenizer_id=tokenizer_id, use_cache=use_cache)
+    def __init__(self, vocab: Union[str, Vocab], tokenizer_id: str = None, key: str = None, **kwargs):
+        super().__init__(vocab=vocab, tokenizer_id=tokenizer_id)
         self.key = key
 
         self.kwargs = kwargs
+        self.param_list = ['key']
         self.param_list.extend(list(kwargs.keys()))
 
         self.tokenizer = AutoTokenizer.from_pretrained(self.key, **self.kwargs)
         self.vocab.extend(self._generate_token_list())
 
     def _generate_token_list(self):
+        if not hasattr(self.tokenizer, 'vocab'):
+            pnt(f'transformer({self.key}): does not provide vocabulary, generating placeholders instead')
+            return list(range(self.tokenizer.vocab_size))
+
         tokens = self.tokenizer.vocab
         if isinstance(tokens, list):
             return tokens
         if not isinstance(tokens, dict):
-            raise ValueError(f'transformer({self.key}): unsupported type of vocabulary')
+            pnt(f'transformer({self.key}): unsupported type of vocabulary, generating placeholders instead')
+            return list(range(self.tokenizer.vocab_size))
 
         num_tokens = len(tokens)
         token_ids = list(tokens.values())
@@ -45,11 +51,15 @@ class TransformersTokenizer(CachableTokenizer):
 
     def __call__(self, obj):
         tokens = self.tokenizer.tokenize(obj)
-        return super().__call__(tokens)
+        tokens = self.tokenizer.convert_tokens_to_ids(tokens)
+        for token in tokens:
+            self.vocab.counter(token)
+        return tokens
 
 
 class BertTokenizer(TransformersTokenizer):
     param_list = []
 
     def __init__(self, **kwargs):
+        kwargs.pop('key', None)
         super().__init__(key='bert-base-uncased', **kwargs)
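
A hedged sketch of the reworked `TransformersTokenizer`: it now derives from `BaseTokenizer`, pre-fills its vocab from the Hugging Face tokenizer (falling back to placeholder ids when no `vocab` attribute is exposed), and converts tokens to ids itself while counting their usage. The model key and vocab names are illustrative:

```python
from unitok.tokenizer import TransformersTokenizer, BertTokenizer

# `key` is the model id handed to AutoTokenizer.from_pretrained (illustrative here).
tok = TransformersTokenizer(vocab='distilbert', key='distilbert-base-uncased', tokenizer_id='distilbert')
ids = tok('UniTok unifies preprocessing.')   # list of token ids; each id is also counted in the vocab

bert = BertTokenizer(vocab='bert')           # key is fixed to 'bert-base-uncased'; a passed `key` is now dropped
```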
--- UniTok-4.0.3/unitok/unitok.py
+++ UniTok-4.3.0/unitok/unitok.py
@@ -173,7 +173,7 @@ class UniTok(Status):
         if tokenizer.return_list:
             raise AttributeError('Column content of the key job should be tokenized into atomic value')
         if self.key_job:
-            raise ValueError(f'key key already exists: {self.key_job.name}')
+            raise ValueError(f'Key column already exists: {self.key_job.name}')
         self.key_job = job
 
     @Status.require_not_organized
@@ -282,7 +282,10 @@ class UniTok(Status):
 
         # Prepare introduction header
         introduction_header = Text.assemble(
-            (f"UniTok ({self.meta.parse_version(self.meta.version)})\n", "bold cyan"),
+            (
+                f"UniTok (v{self.meta.parse_version(Meta.version)}), "
+                f"Data (v{self.meta.parse_version(self.meta.version)})\n",
+                "bold cyan"),
             (f"Sample Size: {self._sample_size}\n", "green"),
             (f"ID Column: {self.key_job.name}\n", "magenta"),
             style="dim"
@@ -462,3 +465,28 @@ class UniTok(Status):
 
         job.max_len = max_len
         self.data[job.name] = series
+
+    def remove_job(self, job: Union[Job, str]):
+        if isinstance(job, str):
+            job = self.meta.jobs[job]
+
+        if job.key:
+            raise ValueError('key job cannot be removed')
+
+        self.meta.jobs.remove(job)
+
+        tokenizer = job.tokenizer
+        for j in self.meta.jobs:
+            if j.tokenizer == tokenizer:
+                break
+        else:
+            self.meta.tokenizers.remove(tokenizer)
+            vocab = tokenizer.vocab
+            for t in self.meta.tokenizers:
+                if t.vocab == vocab:
+                    break
+            else:
+                self.meta.vocabularies.remove(vocab)
+
+        if job.is_processed:
+            self.data.pop(job.name)
--- UniTok-4.0.3/unitok/utils/handler/pkl_handler.py
+++ UniTok-4.3.0/unitok/utils/handler/pkl_handler.py
@@ -14,6 +14,6 @@ class PickleHandler:
         return pickle.load(open(path, "rb"))
 
     @staticmethod
-    def save(data: dict, path: str):
+    def save(data: any, path: str):
         with open(path, "wb") as f:
             pickle.dump(data, cast(SupportsWrite, f))
--- UniTok-4.0.3/unitok/utils/hub/hub.py
+++ UniTok-4.3.0/unitok/utils/hub/hub.py
@@ -11,7 +11,7 @@ class Hub(abc.ABC, Generic[T]):
     _instance: Instance
 
     @classmethod
-    def add(cls, key, obj: T):
+    def add(cls, key, obj: T = None):
         instance = cls._instance.current()
         if key in instance and instance[key] is not obj:
             raise ValueError(f'Conflict object declaration: {obj} and {instance[key]}')
--- UniTok-4.0.3/unitok/vocabulary/vocabulary.py
+++ UniTok-4.3.0/unitok/vocabulary/vocabulary.py
@@ -1,5 +1,7 @@
 import os
+from typing import Optional, Union
 
+from unitok import PickleHandler
 from unitok.utils import Map, Instance
 from unitok.utils.hub import Hub
 from unitok.vocabulary.counter import Counter
@@ -17,7 +19,7 @@ class Vocabulary:
         self._editable = True  # whether vocab is editable
         self.counter = Counter()
 
-        VocabularyHub.add(self.name, self)
+        VocabularyHub.add(self)
 
     def equals(self, other: 'Vocabulary'):
         return self.name == other.name and len(self) == len(other)
@@ -42,7 +44,7 @@ class Vocabulary:
         """
         return [self.append(obj) for obj in objs]
 
-    def append(self, obj, oov_token=None):
+    def append(self, obj, oov_token: Optional[Union[int, str]] = None):
         obj = str(obj)
         if obj not in self.o2i:
             if '\n' in obj:
@@ -51,7 +53,11 @@ class Vocabulary:
             if not self._editable:
                 if oov_token is None:
                     raise ValueError(f'the fixed vocab {self.name} is not allowed to add new token ({obj})')
-                return oov_token
+                if isinstance(oov_token, str):
+                    return self[oov_token]
+                if len(self) > oov_token >= 0:
+                    return oov_token
+                raise ValueError(f'oov_token ({oov_token}) is not in the vocab')
 
             index = len(self)
             self.o2i[obj] = index
@@ -80,6 +86,9 @@ class Vocabulary:
             return self.i2o[item]
         return self.o2i[item]
 
+    def __contains__(self, item: str):
+        return item in self.o2i
+
     def __str__(self):
         return f'Vocabulary({self.name}, vocab_size={len(self)})'
 
@@ -87,6 +96,10 @@ class Vocabulary:
     Editable Methods
     """
 
+    @property
+    def editable(self):
+        return self._editable
+
     def allow_edit(self):
         self._editable = True
         return self
@@ -112,8 +125,8 @@ class Vocabulary:
     Save & Load Methods
     """
 
-    def filepath(self, store_dir):
-        return os.path.join(store_dir, self.filename)
+    def filepath(self, save_dir):
+        return os.path.join(save_dir, self.filename)
 
     @property
     def filename(self):
@@ -124,8 +137,7 @@ class Vocabulary:
         save_dir = self.filepath(save_dir)
 
         self.o2i, self.i2o = {}, {}
-        with open(save_dir, 'r') as f:
-            objs = f.read().strip().split('\n')
+        objs = PickleHandler.load(save_dir)
         for index, obj in enumerate(objs):
             self.o2i[obj] = index
             self.i2o[index] = obj
@@ -134,8 +146,7 @@ class Vocabulary:
 
     def save(self, save_dir):
         store_path = self.filepath(save_dir)
-        with open(store_path, 'w') as f:
-            f.write('\n'.join(self))
+        PickleHandler.save(list(self), store_path)
 
         return self
 
@@ -148,3 +159,8 @@ class Vocabulary:
 
 class VocabularyHub(Hub[Vocabulary]):
     _instance = Instance()
+
+    @classmethod
+    def add(cls, key, obj: Vocabulary = None):
+        key, obj = key.name, key
+        return super().add(key, obj)
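
A hedged sketch of the vocabulary changes: vocabularies are now persisted through `PickleHandler` instead of newline-separated text files (which is what the `Vocab` row added to the README table refers to), membership can be tested with `in`, and a frozen vocab's `append` accepts an `oov_token` given either as a valid index or as an existing token string. The name and target directory below are illustrative, and the directory is assumed to exist:

```python
from unitok.vocabulary import Vocab

v = Vocab('topics')                       # illustrative name
v.extend(['sports', 'finance', 'tech'])

v.save('./store')                         # writes a pickle file into the existing './store' directory
v.load('./store')                         # rebuilds o2i / i2o from that pickle

print('tech' in v)                        # True: __contains__ is new in this release
print(v['finance'])                       # 1: object-to-index lookup, as used by GloVeTokenizer above
```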
--- UniTok-4.0.3/unitok/tokenizer/cachable_tokenizer.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from typing import Hashable
-
-from unitok import warning
-from unitok.tokenizer import BaseTokenizer
-
-
-class CachableTokenizer(BaseTokenizer):
-    def __init__(self, use_cache=False, **kwargs):
-        super().__init__(**kwargs)
-
-        if not self.return_list and use_cache:
-            warning(f'Only the tokenizer that return_list=True may need cache, use_cache of {self.get_classname()} will be set to False')
-            use_cache = False
-        self._use_cache = use_cache
-        self._cache = dict()
-
-    def __call__(self, objs):
-        if self._use_cache and isinstance(objs, Hashable):
-            if objs in self._cache:
-                return self._cache[objs]
-            value = super().__call__(objs)
-            self._cache[objs] = value
-            return value
-
-        return super().__call__(objs)