gptmed 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,137 @@
+ """
+ Text statistics extraction strategy
+ """
+
+ import re
+ from typing import Any, Dict
+
+ from .base_strategy import TextPreprocessingStrategy
+
+
+ class TextStatistics(TextPreprocessingStrategy):
+     """
+     Analyzes and extracts statistics from text
+     """
+
+     def __init__(self):
+         """Initialize text statistics analyzer"""
+         self.stats = {
+             'analyses_performed': 0,
+         }
+         self.last_analysis: Dict[str, Any] = {}
+
+     def process(self, text: str) -> str:
+         """
+         Analyze text and return the original text unchanged
+         (statistics are available via get_stats())
+
+         Args:
+             text: Text to analyze
+
+         Returns:
+             Original text unchanged
+         """
+         self.analyze(text)
+         return text
+
+     def analyze(self, text: str) -> Dict[str, Any]:
+         """
+         Extract comprehensive statistics from text
+
+         Args:
+             text: Text to analyze
+
+         Returns:
+             Dictionary of statistics
+         """
+         if not text:
+             self.last_analysis = self._empty_analysis()
+             return self.last_analysis
+
+         # Count basic metrics
+         word_count = len(text.split())
+         char_count = len(text)
+         char_count_no_spaces = len(text.replace(' ', ''))
+         line_count = len(text.split('\n'))
+
+         # Sentence count (approximate)
+         sentence_count = len([s for s in re.split(r'[.!?]+', text) if s.strip()])
+
+         # Average metrics
+         avg_word_length = char_count_no_spaces / word_count if word_count > 0 else 0
+         avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
+
+         # Unique words
+         words = text.lower().split()
+         unique_words = len(set(words))
+
+         # Vocabulary richness
+         vocabulary_richness = unique_words / word_count if word_count > 0 else 0
+
+         # Punctuation counts
+         punctuation_count = len([c for c in text if c in '.,!?;:'])
+
+         # Number extraction
+         numbers = re.findall(r'\d+', text)
+         number_count = len(numbers)
+
+         # Capitalized words
+         capitalized_words = len([w for w in text.split() if w and w[0].isupper()])
+
+         self.last_analysis = {
+             'word_count': word_count,
+             'char_count': char_count,
+             'char_count_no_spaces': char_count_no_spaces,
+             'line_count': line_count,
+             'sentence_count': sentence_count,
+             'avg_word_length': round(avg_word_length, 2),
+             'avg_sentence_length': round(avg_sentence_length, 2),
+             'unique_words': unique_words,
+             'vocabulary_richness': round(vocabulary_richness, 3),
+             'punctuation_count': punctuation_count,
+             'number_count': number_count,
+             'capitalized_words': capitalized_words,
+         }
+
+         self.stats['analyses_performed'] += 1
+         return self.last_analysis
+
+     def _empty_analysis(self) -> Dict[str, Any]:
+         """Return empty analysis dictionary"""
+         return {
+             'word_count': 0,
+             'char_count': 0,
+             'char_count_no_spaces': 0,
+             'line_count': 0,
+             'sentence_count': 0,
+             'avg_word_length': 0.0,
+             'avg_sentence_length': 0.0,
+             'unique_words': 0,
+             'vocabulary_richness': 0.0,
+             'punctuation_count': 0,
+             'number_count': 0,
+             'capitalized_words': 0,
+         }
+
+     def get_text_stats(self, text: str) -> Dict[str, Any]:
+         """
+         Convenience method to analyze text and return stats immediately
+
+         Args:
+             text: Text to analyze
+
+         Returns:
+             Dictionary of statistics
+         """
+         return self.analyze(text)
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get analyzer statistics"""
+         result = self.stats.copy()
+         result['last_analysis'] = self.last_analysis.copy()
+         return result
+
+     def reset_stats(self) -> None:
+         """Reset statistics"""
+         self.stats = {'analyses_performed': 0}
+         self.last_analysis = {}
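For reviewers skimming the diff, a minimal usage sketch of the new strategy follows. It assumes the class is exposed alongside the package's other text-preprocessing strategies; the import path and sample text are illustrative, not confirmed by this diff.

    from gptmed.data_preparation.text import TextStatistics  # hypothetical import path

    analyzer = TextStatistics()
    text = "GPT models learn from text. They predict the next token!"
    analysis = analyzer.analyze(text)                   # also stored as analyzer.last_analysis
    print(analysis['word_count'])                       # 10
    print(analysis['sentence_count'])                   # 2
    print(analyzer.get_stats()['analyses_performed'])   # 1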
@@ -0,0 +1,376 @@
+ """
+ Parallel Tokenization Pipeline
+
+ Takes preprocessed JSONL and generates tokens for model training.
+ Supports parallel processing with ThreadPoolExecutor for efficient batch processing.
+
+ Supports multiple tokenization methods:
+ 1. Hugging Face Transformers (AutoTokenizer)
+ 2. SentencePiece
+ 3. Built-in Tokenizer from data_preparation module
+
+ Parallel Processing:
+ - Automatically scales based on dataset size
+ - ThreadPoolExecutor for I/O-bound tokenization
+ - Separate outputs for individual documents and merged files
+
+ Usage:
+     python3 tokenize_jsonl.py \
+         --input-file ./output/full_preprocessed.jsonl \
+         --method huggingface \
+         --model gpt2 \
+         --workers 4 \
+         --output-dir ./output/tokens
+ """
+
+ import sys
+ import json
+ import logging
+ from pathlib import Path
+ from typing import Dict, List, Optional, Any, Tuple
+ from dataclasses import dataclass, asdict
+ import time
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ import os
+
+ # Adjust path
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+ from gptmed.data_preparation.text import Tokenizer as CustomTokenizer
+
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class SimplifiedTokenizedRecord:
+     """Simplified tokenized record - only final tokens"""
+     filename: str
+     tokens: List[int]  # Token IDs
+     token_count: int
+     tokenizer_method: str
+     status: str
+
+
+ class ParallelJSONLTokenizer:
+     """Parallel tokenization for preprocessed JSONL files"""
+
+     def __init__(
+         self,
+         input_file: str,
+         output_dir: Optional[str] = None,
+         method: str = 'huggingface',
+         model_name: Optional[str] = None,
+         workers: Optional[int] = None,
+     ):
+         """
+         Initialize parallel tokenizer
+
+         Args:
+             input_file: Input preprocessed JSONL file
+             output_dir: Output directory for tokens
+             method: Tokenization method (custom/huggingface/sentencepiece)
+             model_name: Model name for huggingface/sentencepiece
+             workers: Number of worker threads (auto-detect if None)
+         """
+         self.input_file = Path(input_file)
+
+         if output_dir is None:
+             output_dir = self.input_file.parent / 'tokens'
+
+         self.output_dir = Path(output_dir)
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+         self.method = method
+         self.model_name = model_name
+
+         # Auto-detect workers based on CPU count
+         if workers is None:
+             workers = max(2, min(4, os.cpu_count() or 2))
+         self.workers = workers
+
+         self.logger = logging.getLogger(self.__class__.__name__)
+
+         # Initialize tokenizer based on method
+         self._initialize_tokenizer()
+
+     def _initialize_tokenizer(self):
+         """Initialize tokenizer based on method"""
+         if self.method == 'custom':
+             self.tokenizer = CustomTokenizer(mode='word')
+             self.logger.info("Initialized custom tokenizer (word mode)")
+
+         elif self.method == 'huggingface':
+             try:
+                 from transformers import AutoTokenizer
+                 if self.model_name is None:
+                     self.model_name = 'gpt2'
+                 self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+                 self.logger.info(f"Initialized Hugging Face tokenizer ({self.model_name})")
+             except ImportError:
+                 raise ImportError("Install transformers: pip install transformers")
+
+         elif self.method == 'sentencepiece':
+             try:
+                 import sentencepiece as spm
+                 if self.model_name is None:
+                     raise ValueError("SentencePiece requires model_name parameter")
+                 self.tokenizer = spm.SentencePieceProcessor(model_file=self.model_name)
+                 self.logger.info(f"Initialized SentencePiece tokenizer ({self.model_name})")
+             except ImportError:
+                 raise ImportError("Install sentencepiece: pip install sentencepiece")
+
+         else:
+             raise ValueError(f"Unknown tokenization method: {self.method}")
+
+     def load_jsonl(self) -> List[Dict[str, Any]]:
+         """Load preprocessed JSONL"""
+         if not self.input_file.exists():
+             raise FileNotFoundError(f"Input file not found: {self.input_file}")
+
+         records = []
+         with open(self.input_file, 'r', encoding='utf-8') as f:
+             for line_num, line in enumerate(f, 1):
+                 try:
+                     record = json.loads(line.strip())
+                     records.append(record)
+                 except json.JSONDecodeError as e:
+                     self.logger.warning(f"Line {line_num}: Invalid JSON - {str(e)}")
+
+         self.logger.info(f"Loaded {len(records)} records from {self.input_file.name}")
+         return records
+
+     def _tokenize_with_custom(self, text: str) -> List[str]:
+         """Tokenize using custom tokenizer"""
+         return self.tokenizer.tokenize(text)
+
+     def _tokenize_with_huggingface(self, text: str) -> List[int]:
+         """Tokenize using Hugging Face tokenizer - returns token IDs"""
+         return self.tokenizer.encode(text, add_special_tokens=False)
+
+     def _tokenize_with_sentencepiece(self, text: str) -> List[int]:
+         """Tokenize using SentencePiece - returns token IDs"""
+         return self.tokenizer.encode(text)
+
+     def tokenize_record(self, record: Dict[str, Any], index: int = 0) -> Tuple[SimplifiedTokenizedRecord, str]:
+         """
+         Tokenize a single record
+
+         Returns:
+             Tuple of (SimplifiedTokenizedRecord, filename)
+         """
+         try:
+             filename = record.get('filename', f'document_{index}')
+             text = record.get('text', '')
+
+             if not text or not isinstance(text, str) or len(text.strip()) < 10:
+                 return (
+                     SimplifiedTokenizedRecord(
+                         filename=filename,
+                         tokens=[],
+                         token_count=0,
+                         tokenizer_method=self.method,
+                         status='empty_or_invalid_text'
+                     ),
+                     filename
+                 )
+
+             # Tokenize based on method
+             if self.method == 'custom':
+                 tokens = self._tokenize_with_custom(text)
+                 # Convert to integers if needed
+                 if tokens and isinstance(tokens[0], str):
+                     tokens = list(range(len(tokens)))  # Fallback: replace string tokens with positional IDs
+             elif self.method == 'huggingface':
+                 tokens = self._tokenize_with_huggingface(text)
+             elif self.method == 'sentencepiece':
+                 tokens = self._tokenize_with_sentencepiece(text)
+             else:
+                 tokens = []
+
+             return (
+                 SimplifiedTokenizedRecord(
+                     filename=filename,
+                     tokens=tokens,
+                     token_count=len(tokens),
+                     tokenizer_method=self.method,
+                     status='success'
+                 ),
+                 filename
+             )
+
+         except Exception as e:
+             self.logger.error(f"Error tokenizing {record.get('filename', 'unknown')}: {str(e)}")
+             return (
+                 SimplifiedTokenizedRecord(
+                     filename=record.get('filename', f'document_{index}'),
+                     tokens=[],
+                     token_count=0,
+                     tokenizer_method=self.method,
+                     status=f'error: {str(e)}'
+                 ),
+                 record.get('filename', f'document_{index}')
+             )
+
+     def save_merged_jsonl(self, records: List[SimplifiedTokenizedRecord]) -> bool:
+         """Save all tokenized records to merged file"""
+         try:
+             output_file = self.output_dir / 'merged_tokens.jsonl'
+
+             with open(output_file, 'w', encoding='utf-8') as f:
+                 for record in records:
+                     json.dump(asdict(record), f, ensure_ascii=False)
+                     f.write('\n')
+
+             self.logger.info(f"✓ Saved {len(records)} merged tokens to {output_file.name}")
+             return True
+         except Exception as e:
+             self.logger.error(f"Error saving merged JSONL: {str(e)}")
+             return False
+
+     def process(self) -> Dict[str, Any]:
+         """Main parallel tokenization pipeline"""
+         start_time = time.time()
+
+         self.logger.info("="*70)
+         self.logger.info("Parallel Tokenization Pipeline")
+         self.logger.info("="*70)
+         self.logger.info(f"Input file: {self.input_file}")
+         self.logger.info(f"Output directory: {self.output_dir}")
+         self.logger.info(f"Method: {self.method}")
+         self.logger.info(f"Workers: {self.workers}")
+         if self.model_name:
+             self.logger.info(f"Model: {self.model_name}")
+         self.logger.info("="*70)
+
+         # Load records
+         records = self.load_jsonl()
+
+         if not records:
+             self.logger.warning("No records to tokenize")
+             return {'status': 'failure', 'total_records': 0}
+
+         # Parallel tokenization
+         self.logger.info(f"\nTokenizing {len(records)} records with {self.workers} workers...")
+         tokenized_records = []
+
+         with ThreadPoolExecutor(max_workers=self.workers) as executor:
+             # Submit all tasks
+             futures = {
+                 executor.submit(self.tokenize_record, record, idx): idx
+                 for idx, record in enumerate(records)
+             }
+
+             # Collect results as they complete
+             completed = 0
+             for future in as_completed(futures):
+                 try:
+                     tokenized, filename = future.result()
+                     tokenized_records.append(tokenized)
+
+                     completed += 1
+                     if completed % max(1, len(records) // 10) == 0:
+                         self.logger.info(f"Progress: {completed}/{len(records)} records tokenized")
+
+                 except Exception as e:
+                     self.logger.error(f"Error processing record: {str(e)}")
+
+         # Save merged file
+         self.logger.info(f"\nSaving merged tokenized records...")
+         saved = self.save_merged_jsonl(tokenized_records)
+
+         # Calculate statistics
+         successful = [r for r in tokenized_records if r.status == 'success']
+         failed = [r for r in tokenized_records if r.status != 'success']
+
+         total_time = time.time() - start_time
+
+         # Statistics
+         total_tokens = sum(r.token_count for r in successful)
+         avg_tokens_per_file = total_tokens / len(successful) if successful else 0
+
+         # Print summary
+         self.logger.info(f"\n" + "="*70)
+         self.logger.info(f"Tokenization Summary")
+         self.logger.info(f"="*70)
+         self.logger.info(f"Total records: {len(records)}")
+         self.logger.info(f"Successfully tokenized: {len(successful)}")
+         self.logger.info(f"Failed: {len(failed)}")
+         self.logger.info(f"\nToken Statistics:")
+         self.logger.info(f"  Total tokens: {total_tokens:,}")
+         self.logger.info(f"  Average tokens per file: {avg_tokens_per_file:.1f}")
+         self.logger.info(f"\nOutput File:")
+         self.logger.info(f"  {self.output_dir}/merged_tokens.jsonl")
+         self.logger.info(f"\nTotal time: {total_time:.2f}s")
+         self.logger.info(f"Throughput: {len(records)/total_time:.2f} records/sec")
+         self.logger.info(f"="*70)
+
+         return {
+             'status': 'success' if saved else 'failure',
+             'input_file': str(self.input_file),
+             'output_dir': str(self.output_dir),
+             'total_records': len(records),
+             'successful': len(successful),
+             'failed': len(failed),
+             'total_tokens': total_tokens,
+             'average_tokens_per_record': avg_tokens_per_file,
+             'tokenizer_method': self.method,
+             'workers_used': self.workers,
+             'total_time': total_time,
+             'throughput': len(records) / total_time,
+         }
+
+
+
+
+ if __name__ == '__main__':
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description='Parallel tokenize preprocessed JSONL files'
+     )
+     parser.add_argument(
+         '--input-file',
+         default='./output/full_preprocessed.jsonl',
+         help='Input preprocessed JSONL file'
+     )
+     parser.add_argument(
+         '--output-dir',
+         default=None,
+         help='Output directory for tokens (auto-generated if not specified)'
+     )
+     parser.add_argument(
+         '--method',
+         default='huggingface',
+         choices=['custom', 'huggingface', 'sentencepiece'],
+         help='Tokenization method (default: huggingface)'
+     )
+     parser.add_argument(
+         '--model',
+         default='gpt2',
+         help='Model name for huggingface/sentencepiece (default: gpt2)'
+     )
+     parser.add_argument(
+         '--workers',
+         type=int,
+         default=None,
+         help='Number of worker threads (auto-detect if not specified)'
+     )
+
+     args = parser.parse_args()
+
+     tokenizer = ParallelJSONLTokenizer(
+         input_file=args.input_file,
+         output_dir=args.output_dir,
+         method=args.method,
+         model_name=args.model,
+         workers=args.workers,
+     )
+
+     result = tokenizer.process()
+     sys.exit(0 if result['status'] == 'success' else 1)
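The module docstring above documents the CLI; the same pipeline can also be driven programmatically, as in the rough sketch below. The sample file, paths, and record contents are illustrative only, and the huggingface method additionally requires the transformers package and a downloadable gpt2 tokenizer.

    import json
    from pathlib import Path

    # Input records need at least 'filename' and 'text' fields;
    # texts shorter than 10 characters are recorded as 'empty_or_invalid_text'.
    Path("sample.jsonl").write_text(
        json.dumps({"filename": "doc_0.txt",
                    "text": "Aspirin reduces fever and relieves mild pain."}) + "\n",
        encoding="utf-8",
    )

    # Assumes this runs in the same module where ParallelJSONLTokenizer is defined.
    tok = ParallelJSONLTokenizer(
        input_file="sample.jsonl",
        output_dir="./tokens",      # merged output lands in ./tokens/merged_tokens.jsonl
        method="huggingface",
        model_name="gpt2",
        workers=2,
    )
    summary = tok.process()
    print(summary["status"], summary["total_tokens"])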
@@ -0,0 +1,74 @@
+ """
+ Tokenization strategy - converts text to tokens
+ """
+
+ import re
+ from typing import Any, Dict, List
+
+ from .base_strategy import TextPreprocessingStrategy
+
+
+ class Tokenizer(TextPreprocessingStrategy):
+     """
+     Tokenizes text into words or sentences
+     """
+
+     def __init__(self, mode: str = 'word'):
+         """
+         Initialize tokenizer
+
+         Args:
+             mode: Tokenization mode ('word' or 'sentence')
+         """
+         if mode not in ('word', 'sentence'):
+             raise ValueError(f"Invalid mode: {mode}. Use 'word' or 'sentence'")
+
+         self.mode = mode
+         self.stats = {
+             'mode': mode,
+             'tokens_created': 0,
+         }
+         self.last_tokens: List[str] = []
+
+     def process(self, text: str) -> str:
+         """
+         Tokenize text and return as space-separated tokens
+
+         Args:
+             text: Text to tokenize
+
+         Returns:
+             Space-separated tokens
+         """
+         tokens = self.tokenize(text)
+         self.stats['tokens_created'] = len(tokens)
+         return ' '.join(tokens)
+
+     def tokenize(self, text: str) -> List[str]:
+         """
+         Tokenize text into list of tokens
+
+         Args:
+             text: Text to tokenize
+
+         Returns:
+             List of tokens
+         """
+         if self.mode == 'word':
+             tokens = text.split()
+         elif self.mode == 'sentence':
+             # Split on sentence-ending punctuation
+             tokens = re.split(r'[.!?]+', text)
+             tokens = [s.strip() for s in tokens if s.strip()]
+
+         self.last_tokens = tokens
+         self.stats['tokens_created'] = len(tokens)
+         return tokens
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get tokenization statistics"""
+         return self.stats.copy()
+
+     def reset_stats(self) -> None:
+         """Reset statistics"""
+         self.stats['tokens_created'] = 0
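A brief sketch of the two modes; the import path matches the one used by the pipeline script above, and the sample sentences are illustrative.

    from gptmed.data_preparation.text import Tokenizer

    words = Tokenizer(mode='word').tokenize("Take two tablets daily.")
    # ['Take', 'two', 'tablets', 'daily.']

    sentences = Tokenizer(mode='sentence').tokenize("Take two tablets daily. Avoid alcohol!")
    # ['Take two tablets daily', 'Avoid alcohol']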
@@ -0,0 +1,61 @@
+ """
+ Unicode normalization strategy - handles unicode characters and accents
+ """
+
+ import unicodedata
+ from typing import Any, Dict
+
+ from .base_strategy import TextPreprocessingStrategy
+
+
+ class UnicodeNormalizer(TextPreprocessingStrategy):
+     """
+     Normalizes unicode characters, removes diacritics and combining marks
+     """
+
+     def __init__(self, form: str = 'NFD'):
+         """
+         Initialize normalizer
+
+         Args:
+             form: Unicode normalization form (NFD, NFC, NFKD, NFKC)
+                 NFD is the decomposed form (default)
+                 NFC is the composed form
+         """
+         self.form = form
+         self.stats = {
+             'characters_removed': 0,
+             'form_used': form,
+         }
+
+     def process(self, text: str) -> str:
+         """
+         Normalize unicode text
+
+         Args:
+             text: Text to normalize
+
+         Returns:
+             Normalized text
+         """
+         # Normalize unicode (NFD decomposes accented characters into base + combining marks)
+         normalized = unicodedata.normalize(self.form, text)
+
+         # Remove combining marks (accents, diacritics)
+         cleaned = ''.join(
+             ch for ch in normalized
+             if unicodedata.category(ch) != 'Mn'
+         )
+
+         chars_removed = len(normalized) - len(cleaned)
+         self.stats['characters_removed'] += chars_removed
+
+         return cleaned
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get normalization statistics"""
+         return self.stats.copy()
+
+     def reset_stats(self) -> None:
+         """Reset statistics"""
+         self.stats['characters_removed'] = 0
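And a small sketch of the normalizer's effect on accented input; the import path is an assumption, matching the strategies above.

    from gptmed.data_preparation.text import UnicodeNormalizer  # hypothetical import path

    normalizer = UnicodeNormalizer(form='NFD')   # decompose, then strip combining marks
    print(normalizer.process("café naïve résumé"))
    # cafe naive resume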
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: gptmed
- Version: 0.6.0
+ Version: 0.7.0
  Summary: A lightweight GPT-based language model framework for training custom question-answering models on any domain
  Author-email: Sanjog Sigdel <sigdelsanjog@gmail.com>, Sanjog Sigdel <sanjog.sigdel@ku.edu.np>
  License-Expression: MIT
@@ -51,6 +51,7 @@ Requires-Dist: librosa>=0.10.0; extra == "data-preparation"
  Requires-Dist: soundfile>=0.12.0; extra == "data-preparation"
  Requires-Dist: opencv-python>=4.5.0; extra == "data-preparation"
  Requires-Dist: numpy>=1.24.0; extra == "data-preparation"
+ Requires-Dist: PyPDF2>=3.0.0; extra == "data-preparation"
  Dynamic: license-file

  # GptMed 🤖