gptmed 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
+ """
+ Case normalization strategy - converts text case
+ """
+
+ from typing import Any, Dict
+
+ from .base_strategy import TextPreprocessingStrategy
+
+
+ class CaseNormalizer(TextPreprocessingStrategy):
+     """
+     Handles text case conversion (lowercase, uppercase, title case)
+     """
+
+     def __init__(self, mode: str = 'lower'):
+         """
+         Initialize case normalizer
+
+         Args:
+             mode: Case conversion mode ('lower', 'upper', 'title', 'sentence')
+         """
+         if mode not in ('lower', 'upper', 'title', 'sentence'):
+             raise ValueError(f"Invalid mode: {mode}. Use 'lower', 'upper', 'title', or 'sentence'")
+
+         self.mode = mode
+         self.stats = {
+             'mode_used': mode,
+             'conversions_applied': 0,
+         }
+
+     def process(self, text: str) -> str:
+         """
+         Normalize text case
+
+         Args:
+             text: Text to normalize
+
+         Returns:
+             Case-normalized text
+         """
+         if not text:
+             return text
+
+         original = text
+
+         if self.mode == 'lower':
+             text = text.lower()
+         elif self.mode == 'upper':
+             text = text.upper()
+         elif self.mode == 'title':
+             text = text.title()
+         elif self.mode == 'sentence':
+             text = text[0].upper() + text[1:].lower() if len(text) > 0 else text
+
+         if text != original:
+             self.stats['conversions_applied'] += 1
+
+         return text
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get conversion statistics"""
+         return self.stats.copy()
+
+     def reset_stats(self) -> None:
+         """Reset statistics"""
+         self.stats['conversions_applied'] = 0
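
A minimal usage sketch of the new CaseNormalizer strategy. The import path is an assumption for illustration; the diff shows the module contents but not its location inside the package.

# Assumed import path -- the diff does not show the package layout.
from gptmed.preprocessing.case_normalizer import CaseNormalizer

normalizer = CaseNormalizer(mode='sentence')
cleaned = normalizer.process("THE PATIENT WAS DISCHARGED ON DAY THREE.")
print(cleaned)                 # "The patient was discharged on day three."
print(normalizer.get_stats())  # {'mode_used': 'sentence', 'conversions_applied': 1}
normalizer.reset_stats()
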
@@ -0,0 +1,302 @@
+ """
+ PDF processing strategy - extracts text from PDFs and exports to CSV
+
+ Supports parallel processing for batch operations using concurrent.futures
+ for efficient handling of multiple PDF files.
+ """
+
+ import csv
+ import logging
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
+
+ from .base_strategy import TextPreprocessingStrategy
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class PDFProcessor(TextPreprocessingStrategy):
+     """
+     Processes PDF files: extracts text and exports to CSV
+
+     Supports both sequential and parallel batch processing for efficient
+     handling of multiple PDF files using ThreadPoolExecutor.
+     """
+
+     def __init__(self, max_workers: int = 4, use_threading: bool = True):
+         """
+         Initialize PDF processor
+
+         Args:
+             max_workers: Maximum number of parallel workers (default: 4)
+             use_threading: Use ThreadPoolExecutor (True) or ProcessPoolExecutor (False)
+                 Threading is preferred for I/O-bound PDF extraction
+         """
+         self.PyPDF2 = None
+         self._import_pdf_library()
+
+         self.max_workers = max_workers
+         self.use_threading = use_threading
+
+         self.stats = {
+             'pdfs_processed': 0,
+             'total_text_extracted': 0,
+             'csv_exports': 0,
+             'parallel_jobs_completed': 0,
+             'processing_errors': 0,
+         }
+         self.last_extracted_texts: Dict[str, str] = {}
+
+     def _import_pdf_library(self) -> None:
+         """Import PyPDF2 library, raise error if not available"""
+         try:
+             import PyPDF2
+             self.PyPDF2 = PyPDF2
+         except ImportError:
+             raise ImportError(
+                 "PyPDF2 is required for PDF processing. "
+                 "Install it with: pip install gptmed[data-preparation]"
+             )
+
+     def process(self, text: str) -> str:
+         """
+         This strategy is not designed for streaming text processing.
+         Use extract_text_from_pdf() or batch_process_pdfs() instead.
+
+         Args:
+             text: Not used for PDF processing
+
+         Returns:
+             Empty string (PDFs are processed via dedicated methods)
+         """
+         return ""
+
+     def extract_text_from_pdf(self, pdf_path: str) -> str:
+         """
+         Extract text from a single PDF file
+
+         Args:
+             pdf_path: Path to PDF file
+
+         Returns:
+             Extracted text from PDF
+
+         Raises:
+             FileNotFoundError: If PDF file doesn't exist
+             Exception: If PDF processing fails
+         """
+         pdf_path = Path(pdf_path)
+
+         if not pdf_path.exists():
+             raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+
+         try:
+             text_content = []
+             with open(pdf_path, 'rb') as file:
+                 pdf_reader = self.PyPDF2.PdfReader(file)
+
+                 for page_num in range(len(pdf_reader.pages)):
+                     page = pdf_reader.pages[page_num]
+                     text_content.append(page.extract_text())
+
+             full_text = '\n'.join(text_content)
+             self.last_extracted_texts[str(pdf_path)] = full_text
+             self.stats['pdfs_processed'] += 1
+             self.stats['total_text_extracted'] += len(full_text)
+
+             return full_text
+
+         except Exception as e:
+             raise Exception(f"Failed to process PDF {pdf_path}: {str(e)}")
+
+     def batch_process_pdfs(
+         self,
+         pdf_dir: str,
+         output_format: str = 'dict',
+         parallel: bool = True,
+         max_workers: Optional[int] = None,
+     ) -> Any:
+         """
+         Process multiple PDFs from a directory
+
+         Args:
+             pdf_dir: Directory containing PDF files
+             output_format: Format to return ('dict' or 'list')
+             parallel: Whether to use parallel processing (default: True)
+             max_workers: Override default max workers for this batch operation
+
+         Returns:
+             Dictionary or list of {filename: text} pairs
+         """
+         pdf_dir = Path(pdf_dir)
+
+         if not pdf_dir.is_dir():
+             raise NotADirectoryError(f"Directory not found: {pdf_dir}")
+
+         pdf_files = list(pdf_dir.glob('*.pdf'))
+
+         if not pdf_files:
+             logger.warning(f"No PDF files found in {pdf_dir}")
+             return {} if output_format == 'dict' else []
+
+         if parallel and len(pdf_files) > 1:
+             return self._batch_process_parallel(
+                 pdf_files,
+                 output_format,
+                 max_workers or self.max_workers
+             )
+         else:
+             return self._batch_process_sequential(pdf_files, output_format)
+
+     def _batch_process_sequential(self, pdf_files: List[Path], output_format: str) -> Any:
+         """
+         Process PDFs sequentially (original method)
+
+         Args:
+             pdf_files: List of PDF file paths
+             output_format: Format to return ('dict' or 'list')
+
+         Returns:
+             Dictionary or list of {filename: text} pairs
+         """
+         results = {}
+
+         for pdf_file in pdf_files:
+             try:
+                 text = self.extract_text_from_pdf(str(pdf_file))
+                 results[pdf_file.name] = text
+             except Exception as e:
+                 error_msg = f"Error: {str(e)}"
+                 results[pdf_file.name] = error_msg
+                 self.stats['processing_errors'] += 1
+                 logger.error(f"Error processing {pdf_file.name}: {str(e)}")
+
+         if output_format == 'list':
+             return [{'filename': k, 'text': v} for k, v in results.items()]
+
+         return results
+
+     def _batch_process_parallel(
+         self,
+         pdf_files: List[Path],
+         output_format: str,
+         max_workers: int,
+     ) -> Any:
+         """
+         Process PDFs in parallel using ThreadPoolExecutor
+
+         Args:
+             pdf_files: List of PDF file paths
+             output_format: Format to return ('dict' or 'list')
+             max_workers: Maximum number of concurrent workers
+
+         Returns:
+             Dictionary or list of {filename: text} pairs
+         """
+         results = {}
+         executor_class = ThreadPoolExecutor if self.use_threading else ProcessPoolExecutor
+
+         logger.info(f"Starting parallel PDF processing with {max_workers} workers")
+
+         with executor_class(max_workers=max_workers) as executor:
+             # Submit all tasks
+             future_to_file = {
+                 executor.submit(self.extract_text_from_pdf, str(pdf_file)): pdf_file
+                 for pdf_file in pdf_files
+             }
+
+             # Process completed tasks as they finish
+             for i, future in enumerate(as_completed(future_to_file), 1):
+                 pdf_file = future_to_file[future]
+
+                 try:
+                     text = future.result()
+                     results[pdf_file.name] = text
+                     self.stats['parallel_jobs_completed'] += 1
+                     logger.debug(f"[{i}/{len(pdf_files)}] Completed: {pdf_file.name}")
+
+                 except Exception as e:
+                     error_msg = f"Error: {str(e)}"
+                     results[pdf_file.name] = error_msg
+                     self.stats['processing_errors'] += 1
+                     logger.error(f"Error processing {pdf_file.name}: {str(e)}")
+
+         logger.info(f"Parallel processing complete: {len(results)} files processed")
+
+         if output_format == 'list':
+             return [{'filename': k, 'text': v} for k, v in results.items()]
+
+         return results
+
+     def export_to_csv(self, texts: Dict[str, str], output_file: str) -> None:
+         """
+         Export text data to CSV with filename and text content
+
+         Args:
+             texts: Dictionary of {filename: text_content}
+             output_file: Path to output CSV file
+         """
+         output_path = Path(output_file)
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+
+         try:
+             with open(output_path, 'w', newline='', encoding='utf-8') as f:
+                 writer = csv.writer(f)
+                 writer.writerow(['filename', 'text_content'])
+
+                 for filename, text in texts.items():
+                     writer.writerow([filename, text])
+
+             self.stats['csv_exports'] += 1
+
+         except Exception as e:
+             raise Exception(f"Failed to export CSV: {str(e)}")
+
+     def export_to_csv_detailed(self, texts: Dict[str, str], output_file: str) -> None:
+         """
+         Export text data to CSV with detailed statistics
+
+         Args:
+             texts: Dictionary of {filename: text_content}
+             output_file: Path to output CSV file
+         """
+         output_path = Path(output_file)
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+
+         try:
+             with open(output_path, 'w', newline='', encoding='utf-8') as f:
+                 writer = csv.writer(f)
+                 writer.writerow([
+                     'filename', 'text_content', 'word_count',
+                     'char_count', 'sentence_count'
+                 ])
+
+                 for filename, text in texts.items():
+                     word_count = len(text.split())
+                     char_count = len(text)
+                     sentence_count = len([s for s in text.split('.') if s.strip()])
+
+                     writer.writerow([
+                         filename, text, word_count, char_count, sentence_count
+                     ])
+
+             self.stats['csv_exports'] += 1
+
+         except Exception as e:
+             raise Exception(f"Failed to export detailed CSV: {str(e)}")
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get PDF processing statistics"""
+         return self.stats.copy()
+
+     def reset_stats(self) -> None:
+         """Reset statistics"""
+         self.stats = {
+             'pdfs_processed': 0,
+             'total_text_extracted': 0,
+             'csv_exports': 0,
+             'parallel_jobs_completed': 0,
+             'processing_errors': 0,
+         }
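
A hedged usage sketch of PDFProcessor covering parallel batch extraction and CSV export. The import path is an assumption; PyPDF2 must be installed (pip install gptmed[data-preparation]), and per the code above, files that fail extraction appear in the result dict as "Error: ..." strings.

# Assumed import path -- the diff does not show where PDFProcessor lives in the package.
from gptmed.preprocessing.pdf_processor import PDFProcessor

processor = PDFProcessor(max_workers=4, use_threading=True)

# Extract every *.pdf under ./pdfs; with parallel=True and more than one file,
# extraction runs on a ThreadPoolExecutor, otherwise it falls back to sequential.
texts = processor.batch_process_pdfs('./pdfs', output_format='dict', parallel=True)

# Plain filename/text export, plus the detailed variant with word/char/sentence counts.
processor.export_to_csv(texts, './output/extracted.csv')
processor.export_to_csv_detailed(texts, './output/extracted_detailed.csv')

print(processor.get_stats())
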
@@ -0,0 +1,333 @@
+ """
+ Complete PDF → Tokens Pipeline
+
+ Orchestrates the full preprocessing pipeline:
+ 1. Extract text from PDFs (in-memory)
+ 2. Preprocess text (in-memory)
+ 3. Tokenize (saves merged_tokens.jsonl only)
+
+ This is the main entry point for generating training data.
+
+ Usage:
+     python3 pipeline.py \
+         --input-dir ./pdfs \
+         --output-dir ./output \
+         --tokenizer-method huggingface \
+         --tokenizer-model gpt2 \
+         --workers 4
+ """
+
+ import sys
+ import json
+ import logging
+ import time
+ from pathlib import Path
+ from typing import Dict, List, Any
+ from dataclasses import asdict
+ import argparse
+ import importlib.util
+
+ # Adjust path
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+
+ # Load batch_pdf_to_jsonl
+ spec1 = importlib.util.spec_from_file_location("batch_pdf_to_jsonl", Path(__file__).parent / "batch_pdf_to_jsonl.py")
+ batch_module = importlib.util.module_from_spec(spec1)
+ spec1.loader.exec_module(batch_module)
+ PDFBatchProcessor = batch_module.PDFBatchProcessor
+ PDFRecord = batch_module.PDFRecord
+
+ # Load preprocess_jsonl
+ spec2 = importlib.util.spec_from_file_location("preprocess_jsonl", Path(__file__).parent / "preprocess_jsonl.py")
+ preprocess_module = importlib.util.module_from_spec(spec2)
+ spec2.loader.exec_module(preprocess_module)
+ ComprehensiveJSONLPreprocessor = preprocess_module.ComprehensiveJSONLPreprocessor
+ FullPreprocessedRecord = preprocess_module.FullPreprocessedRecord
+
+ # Load tokenize_jsonl
+ spec3 = importlib.util.spec_from_file_location("tokenize_jsonl", Path(__file__).parent / "tokenize_jsonl.py")
+ tokenize_module = importlib.util.module_from_spec(spec3)
+ spec3.loader.exec_module(tokenize_module)
+ ParallelJSONLTokenizer = tokenize_module.ParallelJSONLTokenizer
+ SimplifiedTokenizedRecord = tokenize_module.SimplifiedTokenizedRecord
+
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+
+ class EndToEndPipeline:
+     """Complete PDF to training data pipeline"""
+
+     def __init__(
+         self,
+         input_dir: str = "./pdfs",
+         output_dir: str = "./output",
+         tokenizer_method: str = "huggingface",
+         tokenizer_model: str = "gpt2",
+         workers: int = 4,
+         case_mode: str = "lower",
+         remove_stopwords: bool = False,
+         remove_punctuation: bool = False,
+     ):
+         """
+         Initialize pipeline
+
+         Args:
+             input_dir: Directory containing PDFs
+             output_dir: Output directory for final results
+             tokenizer_method: Tokenization method (huggingface/custom/sentencepiece)
+             tokenizer_model: Tokenizer model name
+             workers: Number of parallel workers
+             case_mode: Case normalization (lower/upper/title/sentence)
+             remove_stopwords: Whether to remove stopwords
+             remove_punctuation: Whether to remove punctuation
+         """
+         self.input_dir = Path(input_dir)
+         self.output_dir = Path(output_dir)
+         self.tokenizer_method = tokenizer_method
+         self.tokenizer_model = tokenizer_model
+         self.workers = workers
+         self.case_mode = case_mode
+         self.remove_stopwords = remove_stopwords
+         self.remove_punctuation = remove_punctuation
+
+         # Create output directory
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+         self.logger = logging.getLogger(self.__class__.__name__)
+
+     def step1_extract_pdfs(self) -> List[PDFRecord]:
+         """Step 1: Extract text from PDFs in-memory"""
+         self.logger.info("\n" + "="*70)
+         self.logger.info("STEP 1: PDF EXTRACTION")
+         self.logger.info("="*70)
+
+         processor = PDFBatchProcessor(
+             input_dir=str(self.input_dir),
+             output_dir=str(self.output_dir),
+             max_workers=self.workers,
+         )
+
+         result = processor.process()
+         records = result.get('records', [])
+
+         self.logger.info(f"\n✓ Extracted {len(records)} PDF records")
+         return records
+
+     def step2_preprocess_text(self, records: List[PDFRecord]) -> List[FullPreprocessedRecord]:
+         """Step 2: Preprocess text in-memory"""
+         self.logger.info("\n" + "="*70)
+         self.logger.info("STEP 2: TEXT PREPROCESSING")
+         self.logger.info("="*70)
+
+         # Create a dummy temp file just to initialize the preprocessor
+         # We'll manually preprocess records using its methods
+         temp_input = self.output_dir / "_temp_input.jsonl"
+         with open(temp_input, 'w') as f:
+             f.write("{}\n")
+
+         try:
+             processor = ComprehensiveJSONLPreprocessor(
+                 input_file=str(temp_input),
+                 output_file=None,
+                 case_mode=self.case_mode,
+                 remove_stopwords=self.remove_stopwords,
+                 remove_punctuation=self.remove_punctuation,
+             )
+         finally:
+             temp_input.unlink()
+
+         preprocessed_records = []
+
+         # Convert PDFRecord to dict format expected by preprocess_record
+         for record in records:
+             record_dict = {
+                 'filename': record.filename,
+                 'text': record.text,
+                 'word_count': record.word_count,
+             }
+
+             # Preprocess the record
+             preprocessed = processor.preprocess_record(record_dict)
+             preprocessed_records.append(preprocessed)
+
+         self.logger.info(f"✓ Preprocessed {len(preprocessed_records)} records")
+
+         # Save preprocessed output
+         output_file = self.output_dir / "full_preprocessed.jsonl"
+         with open(output_file, 'w', encoding='utf-8') as f:
+             for record in preprocessed_records:
+                 json.dump(asdict(record), f, ensure_ascii=False)
+                 f.write('\n')
+
+         self.logger.info(f"✓ Saved: {output_file.name}")
+
+         return preprocessed_records
+
+     def step3_tokenize(self, preprocessed_records: List[FullPreprocessedRecord]) -> Dict[str, Any]:
+         """Step 3: Tokenize preprocessed text"""
+         self.logger.info("\n" + "="*70)
+         self.logger.info("STEP 3: TOKENIZATION")
+         self.logger.info("="*70)
+
+         # Save preprocessed records to temporary JSONL for tokenizer to consume
+         temp_file = self.output_dir / "_temp_preprocessed.jsonl"
+         with open(temp_file, 'w', encoding='utf-8') as f:
+             for record in preprocessed_records:
+                 json.dump(asdict(record), f, ensure_ascii=False)
+                 f.write('\n')
+
+         # Initialize tokenizer
+         tokens_dir = self.output_dir / "tokens"
+         tokenizer = ParallelJSONLTokenizer(
+             input_file=str(temp_file),
+             output_dir=str(tokens_dir),
+             method=self.tokenizer_method,
+             model_name=self.tokenizer_model,
+             workers=self.workers,
+         )
+
+         # Tokenize
+         result = tokenizer.process()
+
+         # Clean up temporary file
+         temp_file.unlink()
+
+         return result
+
+     def run(self) -> Dict[str, Any]:
+         """Execute full pipeline"""
+         start_time = time.time()
+
+         self.logger.info("\n")
+         self.logger.info("╔" + "="*68 + "╗")
+         self.logger.info("║" + " "*15 + "END-TO-END PDF → TOKENS PIPELINE" + " "*21 + "║")
+         self.logger.info("╚" + "="*68 + "╝")
+
+         try:
+             # Step 1: Extract PDFs
+             pdf_records = self.step1_extract_pdfs()
+
+             if not pdf_records:
+                 self.logger.error("No PDF records extracted. Exiting.")
+                 return {'status': 'failure', 'message': 'No PDFs extracted'}
+
+             # Step 2: Preprocess
+             preprocessed_records = self.step2_preprocess_text(pdf_records)
+
+             if not preprocessed_records:
+                 self.logger.error("No records preprocessed. Exiting.")
+                 return {'status': 'failure', 'message': 'Preprocessing failed'}
+
+             # Step 3: Tokenize
+             tokenization_result = self.step3_tokenize(preprocessed_records)
+
+             total_time = time.time() - start_time
+
+             # Final summary
+             self.logger.info("\n" + "="*70)
+             self.logger.info("PIPELINE COMPLETE")
+             self.logger.info("="*70)
+             self.logger.info(f"\nFinal Outputs:")
+             self.logger.info(f" 1. {self.output_dir}/full_preprocessed.jsonl (cleaned text)")
+             self.logger.info(f" 2. {self.output_dir}/tokens/merged_tokens.jsonl (training tokens)")
+             self.logger.info(f"\nTotal Time: {total_time:.2f}s")
+             self.logger.info(f"="*70 + "\n")
+
+             return {
+                 'status': 'success',
+                 'total_time': total_time,
+                 'pdf_extraction': {
+                     'records': len(pdf_records),
+                 },
+                 'preprocessing': {
+                     'records': len(preprocessed_records),
+                 },
+                 'tokenization': tokenization_result,
+                 'output_dir': str(self.output_dir),
+             }
+
+         except Exception as e:
+             self.logger.error(f"Pipeline failed: {str(e)}")
+             import traceback
+             traceback.print_exc()
+             return {'status': 'failure', 'error': str(e)}
+
+
+ def main():
+     """Main entry point"""
+     parser = argparse.ArgumentParser(
+         description='Complete PDF to training tokens pipeline'
+     )
+     parser.add_argument(
+         '--input-dir',
+         default='./pdfs',
+         help='Input directory containing PDFs (default: ./pdfs)'
+     )
+     parser.add_argument(
+         '--output-dir',
+         default='./output',
+         help='Output directory for final results (default: ./output)'
+     )
+     parser.add_argument(
+         '--tokenizer-method',
+         default='huggingface',
+         choices=['huggingface', 'custom', 'sentencepiece'],
+         help='Tokenization method (default: huggingface)'
+     )
+     parser.add_argument(
+         '--tokenizer-model',
+         default='gpt2',
+         help='Tokenizer model name (default: gpt2)'
+     )
+     parser.add_argument(
+         '--workers',
+         type=int,
+         default=4,
+         help='Number of parallel workers (default: 4)'
+     )
+     parser.add_argument(
+         '--case-mode',
+         default='lower',
+         choices=['lower', 'upper', 'title', 'sentence'],
+         help='Case normalization mode (default: lower)'
+     )
+     parser.add_argument(
+         '--remove-stopwords',
+         action='store_true',
+         help='Remove common stopwords during preprocessing'
+     )
+     parser.add_argument(
+         '--remove-punctuation',
+         action='store_true',
+         help='Remove punctuation during preprocessing'
+     )
+
+     args = parser.parse_args()
+
+     # Create and run pipeline
+     pipeline = EndToEndPipeline(
+         input_dir=args.input_dir,
+         output_dir=args.output_dir,
+         tokenizer_method=args.tokenizer_method,
+         tokenizer_model=args.tokenizer_model,
+         workers=args.workers,
+         case_mode=args.case_mode,
+         remove_stopwords=args.remove_stopwords,
+         remove_punctuation=args.remove_punctuation,
+     )
+
+     result = pipeline.run()
+
+     # Exit with appropriate code
+     return 0 if result['status'] == 'success' else 1
+
+
+ if __name__ == '__main__':
+     exit_code = main()
+     sys.exit(exit_code)
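
For completeness, a sketch of driving the pipeline programmatically instead of via the CLI shown in the module docstring. The import is assumed (the diff does not show the module's path inside the package); the result keys mirror what run() returns above.

# Assumed import -- adjust to wherever the pipeline module is installed.
from pipeline import EndToEndPipeline

pipeline = EndToEndPipeline(
    input_dir='./pdfs',
    output_dir='./output',
    tokenizer_method='huggingface',
    tokenizer_model='gpt2',
    workers=4,
    case_mode='lower',
)
result = pipeline.run()

if result['status'] == 'success':
    # Final artifacts: full_preprocessed.jsonl and tokens/merged_tokens.jsonl
    print(f"Done in {result['total_time']:.2f}s -> {result['output_dir']}")
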