gptmed 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gptmed/data_preparation/text/__init__.py +190 -100
- gptmed/data_preparation/text/base_strategy.py +28 -0
- gptmed/data_preparation/text/batch_pdf_to_jsonl.py +274 -0
- gptmed/data_preparation/text/case_normalizer.py +66 -0
- gptmed/data_preparation/text/pdf_processor.py +302 -0
- gptmed/data_preparation/text/pipeline.py +333 -0
- gptmed/data_preparation/text/preprocess_jsonl.py +466 -0
- gptmed/data_preparation/text/punctuation_handler.py +65 -0
- gptmed/data_preparation/text/stopword_remover.py +87 -0
- gptmed/data_preparation/text/text_cleaner.py +74 -0
- gptmed/data_preparation/text/text_statistics.py +137 -0
- gptmed/data_preparation/text/tokenize_jsonl.py +376 -0
- gptmed/data_preparation/text/tokenizer.py +74 -0
- gptmed/data_preparation/text/unicode_normalizer.py +61 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/METADATA +2 -1
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/RECORD +20 -7
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/WHEEL +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/entry_points.txt +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/top_level.txt +0 -0

gptmed/data_preparation/text/text_statistics.py
@@ -0,0 +1,137 @@
+"""
+Text statistics extraction strategy
+"""
+
+import re
+from typing import Any, Dict
+
+from .base_strategy import TextPreprocessingStrategy
+
+
+class TextStatistics(TextPreprocessingStrategy):
+    """
+    Analyzes and extracts statistics from text
+    """
+
+    def __init__(self):
+        """Initialize text statistics analyzer"""
+        self.stats = {
+            'analyses_performed': 0,
+        }
+        self.last_analysis: Dict[str, Any] = {}
+
+    def process(self, text: str) -> str:
+        """
+        Analyze text and return original text unchanged
+        (statistics are stored in get_stats())
+
+        Args:
+            text: Text to analyze
+
+        Returns:
+            Original text unchanged
+        """
+        self.analyze(text)
+        return text
+
+    def analyze(self, text: str) -> Dict[str, Any]:
+        """
+        Extract comprehensive statistics from text
+
+        Args:
+            text: Text to analyze
+
+        Returns:
+            Dictionary of statistics
+        """
+        if not text:
+            self.last_analysis = self._empty_analysis()
+            return self.last_analysis
+
+        # Count basic metrics
+        word_count = len(text.split())
+        char_count = len(text)
+        char_count_no_spaces = len(text.replace(' ', ''))
+        line_count = len(text.split('\n'))
+
+        # Sentence count (approximate)
+        sentence_count = len([s for s in re.split(r'[.!?]+', text) if s.strip()])
+
+        # Average metrics
+        avg_word_length = char_count_no_spaces / word_count if word_count > 0 else 0
+        avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
+
+        # Unique words
+        words = text.lower().split()
+        unique_words = len(set(words))
+
+        # Vocabulary richness
+        vocabulary_richness = unique_words / word_count if word_count > 0 else 0
+
+        # Punctuation counts
+        punctuation_count = len([c for c in text if c in '.,!?;:'])
+
+        # Number extraction
+        numbers = re.findall(r'\d+', text)
+        number_count = len(numbers)
+
+        # Capitalized words
+        capitalized_words = len([w for w in text.split() if w and w[0].isupper()])
+
+        self.last_analysis = {
+            'word_count': word_count,
+            'char_count': char_count,
+            'char_count_no_spaces': char_count_no_spaces,
+            'line_count': line_count,
+            'sentence_count': sentence_count,
+            'avg_word_length': round(avg_word_length, 2),
+            'avg_sentence_length': round(avg_sentence_length, 2),
+            'unique_words': unique_words,
+            'vocabulary_richness': round(vocabulary_richness, 3),
+            'punctuation_count': punctuation_count,
+            'number_count': number_count,
+            'capitalized_words': capitalized_words,
+        }
+
+        self.stats['analyses_performed'] += 1
+        return self.last_analysis
+
+    def _empty_analysis(self) -> Dict[str, Any]:
+        """Return empty analysis dictionary"""
+        return {
+            'word_count': 0,
+            'char_count': 0,
+            'char_count_no_spaces': 0,
+            'line_count': 0,
+            'sentence_count': 0,
+            'avg_word_length': 0.0,
+            'avg_sentence_length': 0.0,
+            'unique_words': 0,
+            'vocabulary_richness': 0.0,
+            'punctuation_count': 0,
+            'number_count': 0,
+            'capitalized_words': 0,
+        }
+
+    def get_text_stats(self, text: str) -> Dict[str, Any]:
+        """
+        Convenience method to analyze text and return stats immediately
+
+        Args:
+            text: Text to analyze
+
+        Returns:
+            Dictionary of statistics
+        """
+        return self.analyze(text)
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get analyzer statistics"""
+        result = self.stats.copy()
+        result['last_analysis'] = self.last_analysis.copy()
+        return result
+
+    def reset_stats(self) -> None:
+        """Reset statistics"""
+        self.stats = {'analyses_performed': 0}
+        self.last_analysis = {}
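
Editor's note: the snippet below is a minimal usage sketch and is not part of the released wheel. It exercises TextStatistics exactly as defined in the hunk above; the direct module import path is an assumption based on the file list at the top of this diff.

# Editor's sketch: exercising TextStatistics from the hunk above.
# The module path is assumed from gptmed/data_preparation/text/text_statistics.py.
from gptmed.data_preparation.text.text_statistics import TextStatistics

analyzer = TextStatistics()
report = analyzer.analyze('Dr. Smith prescribed 20 mg twice daily. Review in 2 weeks!')
print(report['word_count'], report['sentence_count'], report['number_count'])

# process() returns the text unchanged; the numbers are kept for get_stats().
analyzer.process('Plain follow-up note.')
print(analyzer.get_stats()['analyses_performed'])  # 2 analyses so far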

gptmed/data_preparation/text/tokenize_jsonl.py
@@ -0,0 +1,376 @@
+"""
+Parallel Tokenization Pipeline
+
+Takes preprocessed JSONL and generates tokens for model training.
+Supports parallel processing with ThreadPoolExecutor for efficient batch processing.
+
+Supports multiple tokenization methods:
+1. Hugging Face Transformers (AutoTokenizer)
+2. SentencePiece
+3. Built-in Tokenizer from data_preparation module
+
+Parallel Processing:
+- Automatically scales based on dataset size
+- ThreadPoolExecutor for I/O-bound tokenization
+- Separate outputs for individual documents and merged files
+
+Usage:
+    python3 tokenize_jsonl.py \
+        --input-file ./output/full_preprocessed.jsonl \
+        --method huggingface \
+        --model gpt2 \
+        --workers 4 \
+        --output-dir ./output/tokens
+"""
+
+import sys
+import json
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional, Any, Tuple
+from dataclasses import dataclass, asdict
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import os
+
+# Adjust path
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+from gptmed.data_preparation.text import Tokenizer as CustomTokenizer
+
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SimplifiedTokenizedRecord:
+    """Simplified tokenized record - only final tokens"""
+    filename: str
+    tokens: List[int]  # Token IDs
+    token_count: int
+    tokenizer_method: str
+    status: str
+
+
+class ParallelJSONLTokenizer:
+    """Parallel tokenization for preprocessed JSONL files"""
+
+    def __init__(
+        self,
+        input_file: str,
+        output_dir: str = None,
+        method: str = 'huggingface',
+        model_name: Optional[str] = None,
+        workers: Optional[int] = None,
+    ):
+        """
+        Initialize parallel tokenizer
+
+        Args:
+            input_file: Input preprocessed JSONL file
+            output_dir: Output directory for tokens
+            method: Tokenization method (custom/huggingface/sentencepiece)
+            model_name: Model name for huggingface/sentencepiece
+            workers: Number of worker threads (auto-detect if None)
+        """
+        self.input_file = Path(input_file)
+
+        if output_dir is None:
+            output_dir = self.input_file.parent / 'tokens'
+
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        self.method = method
+        self.model_name = model_name
+
+        # Auto-detect workers based on CPU count
+        if workers is None:
+            workers = max(2, min(4, os.cpu_count() or 2))
+        self.workers = workers
+
+        self.logger = logging.getLogger(self.__class__.__name__)
+
+        # Initialize tokenizer based on method
+        self._initialize_tokenizer()
+
+    def _initialize_tokenizer(self):
+        """Initialize tokenizer based on method"""
+        if self.method == 'custom':
+            self.tokenizer = CustomTokenizer(mode='word')
+            self.logger.info("Initialized custom tokenizer (word mode)")
+
+        elif self.method == 'huggingface':
+            try:
+                from transformers import AutoTokenizer
+                if self.model_name is None:
+                    self.model_name = 'gpt2'
+                self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+                self.logger.info(f"Initialized Hugging Face tokenizer ({self.model_name})")
+            except ImportError:
+                raise ImportError("Install transformers: pip install transformers")
+
+        elif self.method == 'sentencepiece':
+            try:
+                import sentencepiece as spm
+                if self.model_name is None:
+                    raise ValueError("SentencePiece requires model_name parameter")
+                self.tokenizer = spm.SentencePieceProcessor(model_file=self.model_name)
+                self.logger.info(f"Initialized SentencePiece tokenizer ({self.model_name})")
+            except ImportError:
+                raise ImportError("Install sentencepiece: pip install sentencepiece")
+
+        else:
+            raise ValueError(f"Unknown tokenization method: {self.method}")
+
+    def load_jsonl(self) -> List[Dict[str, Any]]:
+        """Load preprocessed JSONL"""
+        if not self.input_file.exists():
+            raise FileNotFoundError(f"Input file not found: {self.input_file}")
+
+        records = []
+        with open(self.input_file, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                try:
+                    record = json.loads(line.strip())
+                    records.append(record)
+                except json.JSONDecodeError as e:
+                    self.logger.warning(f"Line {line_num}: Invalid JSON - {str(e)}")
+
+        self.logger.info(f"Loaded {len(records)} records from {self.input_file.name}")
+        return records
+
+    def _tokenize_with_custom(self, text: str) -> List[str]:
+        """Tokenize using custom tokenizer"""
+        return self.tokenizer.tokenize(text)
+
+    def _tokenize_with_huggingface(self, text: str) -> List[int]:
+        """Tokenize using Hugging Face tokenizer - returns token IDs"""
+        return self.tokenizer.encode(text, add_special_tokens=False)
+
+    def _tokenize_with_sentencepiece(self, text: str) -> List[int]:
+        """Tokenize using SentencePiece - returns token IDs"""
+        return self.tokenizer.encode(text)
+
+    def tokenize_record(self, record: Dict[str, Any], index: int = 0) -> Tuple[SimplifiedTokenizedRecord, str]:
+        """
+        Tokenize a single record
+
+        Returns:
+            Tuple of (TokenizedRecord, filename_clean)
+        """
+        try:
+            filename = record.get('filename', f'document_{index}')
+            text = record.get('text', '')
+
+            if not text or not isinstance(text, str) or len(text.strip()) < 10:
+                return (
+                    SimplifiedTokenizedRecord(
+                        filename=filename,
+                        tokens=[],
+                        token_count=0,
+                        tokenizer_method=self.method,
+                        status='empty_or_invalid_text'
+                    ),
+                    filename
+                )
+
+            # Tokenize based on method
+            if self.method == 'custom':
+                tokens = self._tokenize_with_custom(text)
+                # Convert to integers if needed
+                if tokens and isinstance(tokens[0], str):
+                    tokens = list(range(len(tokens)))  # Fallback
+            elif self.method == 'huggingface':
+                tokens = self._tokenize_with_huggingface(text)
+            elif self.method == 'sentencepiece':
+                tokens = self._tokenize_with_sentencepiece(text)
+            else:
+                tokens = []
+
+            return (
+                SimplifiedTokenizedRecord(
+                    filename=filename,
+                    tokens=tokens,
+                    token_count=len(tokens),
+                    tokenizer_method=self.method,
+                    status='success'
+                ),
+                filename
+            )
+
+        except Exception as e:
+            self.logger.error(f"Error tokenizing {record.get('filename', 'unknown')}: {str(e)}")
+            return (
+                SimplifiedTokenizedRecord(
+                    filename=record.get('filename', f'document_{index}'),
+                    tokens=[],
+                    token_count=0,
+                    tokenizer_method=self.method,
+                    status=f'error: {str(e)}'
+                ),
+                record.get('filename', f'document_{index}')
+            )
+
+    def save_merged_jsonl(self, records: List[SimplifiedTokenizedRecord]) -> bool:
+        """Save all tokenized records to merged file"""
+        try:
+            output_file = self.output_dir / 'merged_tokens.jsonl'
+
+            with open(output_file, 'w', encoding='utf-8') as f:
+                for record in records:
+                    json.dump(asdict(record), f, ensure_ascii=False)
+                    f.write('\n')
+
+            self.logger.info(f"✓ Saved {len(records)} merged tokens to {output_file.name}")
+            return True
+        except Exception as e:
+            self.logger.error(f"Error saving merged JSONL: {str(e)}")
+            return False
+
+    def process(self) -> Dict[str, Any]:
+        """Main parallel tokenization pipeline"""
+        start_time = time.time()
+
+        self.logger.info("="*70)
+        self.logger.info("Parallel Tokenization Pipeline")
+        self.logger.info("="*70)
+        self.logger.info(f"Input file: {self.input_file}")
+        self.logger.info(f"Output directory: {self.output_dir}")
+        self.logger.info(f"Method: {self.method}")
+        self.logger.info(f"Workers: {self.workers}")
+        if self.model_name:
+            self.logger.info(f"Model: {self.model_name}")
+        self.logger.info("="*70)
+
+        # Load records
+        records = self.load_jsonl()
+
+        if not records:
+            self.logger.warning("No records to tokenize")
+            return {'status': 'failure', 'total_records': 0}
+
+        # Parallel tokenization
+        self.logger.info(f"\nTokenizing {len(records)} records with {self.workers} workers...")
+        tokenized_records = []
+
+        with ThreadPoolExecutor(max_workers=self.workers) as executor:
+            # Submit all tasks
+            futures = {
+                executor.submit(self.tokenize_record, record, idx): idx
+                for idx, record in enumerate(records)
+            }
+
+            # Collect results as they complete
+            completed = 0
+            for future in as_completed(futures):
+                try:
+                    tokenized, filename = future.result()
+                    tokenized_records.append(tokenized)
+
+                    completed += 1
+                    if completed % max(1, len(records) // 10) == 0:
+                        self.logger.info(f"Progress: {completed}/{len(records)} records tokenized")
+
+                except Exception as e:
+                    self.logger.error(f"Error processing record: {str(e)}")
+
+        # Save merged file
+        self.logger.info(f"\nSaving merged tokenized records...")
+        saved = self.save_merged_jsonl(tokenized_records)
+
+        # Calculate statistics
+        successful = [r for r in tokenized_records if r.status == 'success']
+        failed = [r for r in tokenized_records if r.status != 'success']
+
+        total_time = time.time() - start_time
+
+        # Statistics
+        total_tokens = sum(r.token_count for r in successful)
+        avg_tokens_per_file = total_tokens / len(successful) if successful else 0
+
+        # Print summary
+        self.logger.info(f"\n" + "="*70)
+        self.logger.info(f"Tokenization Summary")
+        self.logger.info(f"="*70)
+        self.logger.info(f"Total records: {len(records)}")
+        self.logger.info(f"Successfully tokenized: {len(successful)}")
+        self.logger.info(f"Failed: {len(failed)}")
+        self.logger.info(f"\nToken Statistics:")
+        self.logger.info(f"  Total tokens: {total_tokens:,}")
+        self.logger.info(f"  Average tokens per file: {avg_tokens_per_file:.1f}")
+        self.logger.info(f"\nOutput File:")
+        self.logger.info(f"  {self.output_dir}/merged_tokens.jsonl")
+        self.logger.info(f"\nTotal time: {total_time:.2f}s")
+        self.logger.info(f"Throughput: {len(records)/total_time:.2f} records/sec")
+        self.logger.info(f"="*70)
+
+        return {
+            'status': 'success' if saved else 'failure',
+            'input_file': str(self.input_file),
+            'output_dir': str(self.output_dir),
+            'total_records': len(records),
+            'successful': len(successful),
+            'failed': len(failed),
+            'total_tokens': total_tokens,
+            'average_tokens_per_record': avg_tokens_per_file,
+            'tokenizer_method': self.method,
+            'workers_used': self.workers,
+            'total_time': total_time,
+            'throughput': len(records) / total_time,
+        }
+
+
+
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description='Parallel tokenize preprocessed JSONL files'
+    )
+    parser.add_argument(
+        '--input-file',
+        default='./output/full_preprocessed.jsonl',
+        help='Input preprocessed JSONL file'
+    )
+    parser.add_argument(
+        '--output-dir',
+        default=None,
+        help='Output directory for tokens (auto-generated if not specified)'
+    )
+    parser.add_argument(
+        '--method',
+        default='huggingface',
+        choices=['custom', 'huggingface', 'sentencepiece'],
+        help='Tokenization method (default: huggingface)'
+    )
+    parser.add_argument(
+        '--model',
+        default='gpt2',
+        help='Model name for huggingface/sentencepiece (default: gpt2)'
+    )
+    parser.add_argument(
+        '--workers',
+        type=int,
+        default=None,
+        help='Number of worker threads (auto-detect if not specified)'
+    )
+
+    args = parser.parse_args()
+
+    tokenizer = ParallelJSONLTokenizer(
+        input_file=args.input_file,
+        output_dir=args.output_dir,
+        method=args.method,
+        model_name=args.model,
+        workers=args.workers,
+    )
+
+    result = tokenizer.process()
+    sys.exit(0 if result['status'] == 'success' else 1)
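
Editor's note: the module above is primarily a CLI script, but the class can also be driven from Python. The sketch below is not part of the released wheel; the paths are placeholders, and it assumes the optional transformers dependency is installed so the default Hugging Face method works.

# Editor's sketch: driving ParallelJSONLTokenizer programmatically instead of via the CLI.
from gptmed.data_preparation.text.tokenize_jsonl import ParallelJSONLTokenizer

tokenizer = ParallelJSONLTokenizer(
    input_file='./output/full_preprocessed.jsonl',  # one JSON object per line with "filename" and "text"
    output_dir='./output/tokens',
    method='huggingface',
    model_name='gpt2',
    workers=4,
)
result = tokenizer.process()  # writes ./output/tokens/merged_tokens.jsonl
print(result['successful'], result['failed'], result['total_tokens'])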

gptmed/data_preparation/text/tokenizer.py
@@ -0,0 +1,74 @@
+"""
+Tokenization strategy - converts text to tokens
+"""
+
+import re
+from typing import Any, Dict, List
+
+from .base_strategy import TextPreprocessingStrategy
+
+
+class Tokenizer(TextPreprocessingStrategy):
+    """
+    Tokenizes text into words or sentences
+    """
+
+    def __init__(self, mode: str = 'word'):
+        """
+        Initialize tokenizer
+
+        Args:
+            mode: Tokenization mode ('word' or 'sentence')
+        """
+        if mode not in ('word', 'sentence'):
+            raise ValueError(f"Invalid mode: {mode}. Use 'word' or 'sentence'")
+
+        self.mode = mode
+        self.stats = {
+            'mode': mode,
+            'tokens_created': 0,
+        }
+        self.last_tokens: List[str] = []
+
+    def process(self, text: str) -> str:
+        """
+        Tokenize text and return as space-separated tokens
+
+        Args:
+            text: Text to tokenize
+
+        Returns:
+            Space-separated tokens
+        """
+        tokens = self.tokenize(text)
+        self.stats['tokens_created'] = len(tokens)
+        return ' '.join(tokens)
+
+    def tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize text into list of tokens
+
+        Args:
+            text: Text to tokenize
+
+        Returns:
+            List of tokens
+        """
+        if self.mode == 'word':
+            tokens = text.split()
+        elif self.mode == 'sentence':
+            # Split on sentence-ending punctuation
+            tokens = re.split(r'[.!?]+', text)
+            tokens = [s.strip() for s in tokens if s.strip()]
+
+        self.last_tokens = tokens
+        self.stats['tokens_created'] = len(tokens)
+        return tokens
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get tokenization statistics"""
+        return self.stats.copy()
+
+    def reset_stats(self) -> None:
+        """Reset statistics"""
+        self.stats['tokens_created'] = 0
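
Editor's note: a short sketch (not from the wheel) showing the built-in Tokenizer in both modes; the package-level import mirrors the one used by tokenize_jsonl.py above.

# Editor's sketch: the built-in Tokenizer in word and sentence modes.
from gptmed.data_preparation.text import Tokenizer

words = Tokenizer(mode='word').tokenize('chest pain with shortness of breath')
print(words)  # ['chest', 'pain', 'with', 'shortness', 'of', 'breath']

sentences = Tokenizer(mode='sentence')
print(sentences.process('Take two tablets. Repeat after meals!'))
# process() re-joins tokens with single spaces: 'Take two tablets Repeat after meals'
print(sentences.get_stats())  # {'mode': 'sentence', 'tokens_created': 2}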

gptmed/data_preparation/text/unicode_normalizer.py
@@ -0,0 +1,61 @@
+"""
+Unicode normalization strategy - handles unicode characters and accents
+"""
+
+import unicodedata
+from typing import Any, Dict
+
+from .base_strategy import TextPreprocessingStrategy
+
+
+class UnicodeNormalizer(TextPreprocessingStrategy):
+    """
+    Normalizes unicode characters, removes diacritics and combining marks
+    """
+
+    def __init__(self, form: str = 'NFD'):
+        """
+        Initialize normalizer
+
+        Args:
+            form: Unicode normalization form (NFD, NFC, NFKD, NFKC)
+                NFD is decomposed form (default)
+                NFC is composed form
+        """
+        self.form = form
+        self.stats = {
+            'characters_removed': 0,
+            'form_used': form,
+        }
+
+    def process(self, text: str) -> str:
+        """
+        Normalize unicode text
+
+        Args:
+            text: Text to normalize
+
+        Returns:
+            Normalized text
+        """
+        # Normalize unicode (NFD - decomposed form removes accents)
+        normalized = unicodedata.normalize(self.form, text)
+
+        # Remove combining marks (accents, diacritics)
+        cleaned = ''.join(
+            ch for ch in normalized
+            if unicodedata.category(ch) != 'Mn'
+        )
+
+        chars_removed = len(text) - len(cleaned)
+        self.stats['characters_removed'] += chars_removed
+
+        return cleaned
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get normalization statistics"""
+        return self.stats.copy()
+
+    def reset_stats(self) -> None:
+        """Reset statistics"""
+        self.stats['characters_removed'] = 0
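
Editor's note: a minimal sketch (not from the wheel) of the UnicodeNormalizer added above; the direct module import path is an assumption based on the file list.

# Editor's sketch: stripping diacritics with the new UnicodeNormalizer.
from gptmed.data_preparation.text.unicode_normalizer import UnicodeNormalizer

normalizer = UnicodeNormalizer(form='NFD')
print(normalizer.process('Síndrome de Guillain-Barré'))
# -> 'Sindrome de Guillain-Barre' (combining marks dropped after NFD decomposition)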

{gptmed-0.6.0.dist-info → gptmed-0.7.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gptmed
-Version: 0.6.0
+Version: 0.7.0
 Summary: A lightweight GPT-based language model framework for training custom question-answering models on any domain
 Author-email: Sanjog Sigdel <sigdelsanjog@gmail.com>, Sanjog Sigdel <sanjog.sigdel@ku.edu.np>
 License-Expression: MIT
@@ -51,6 +51,7 @@ Requires-Dist: librosa>=0.10.0; extra == "data-preparation"
 Requires-Dist: soundfile>=0.12.0; extra == "data-preparation"
 Requires-Dist: opencv-python>=4.5.0; extra == "data-preparation"
 Requires-Dist: numpy>=1.24.0; extra == "data-preparation"
+Requires-Dist: PyPDF2>=3.0.0; extra == "data-preparation"
 Dynamic: license-file
 
 # GptMed 🤖