cv-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
# Exception hierarchy for the CvParser namespace.
#
# Every class below descends from CvParser::Error, which is itself a
# StandardError, so `rescue CvParser::Error` covers all of them.
module CvParser
  # Root of the hierarchy.
  class Error < StandardError
  end

  # --- direct children of Error -------------------------------------------

  class ConfigurationError < Error
  end

  class UnsupportedFormat < Error
  end

  class ParseError < Error
  end

  class FileNotFoundError < Error
  end

  class FileNotReadableError < Error
  end

  # --- API errors: APIError and its three subclasses ----------------------

  class APIError < Error
  end

  class RateLimitError < APIError
  end

  class AuthenticationError < APIError
  end

  class InvalidRequestError < APIError
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CvParser
4
# Validates configuration and input, then hands the file to the configured
# provider object for extraction.
class Extractor
  # @param config [Object] object responding to #provider, #api_key and
  #   #output_schema; defaults to CvParser.configuration
  # @raise [ConfigurationError] if no provider is set, the provider is not
  #   one of :openai / :anthropic / :faker, or a non-faker provider has a
  #   nil or empty API key
  def initialize(config = CvParser.configuration)
    @config = config
    validate_config!
    @provider = build_provider
  end

  # Runs extraction for one file.
  #
  # @param file_path [String] path of the file to send to the provider
  # @param output_schema [Object, nil] schema for this call; when nil the
  #   configured output_schema is used
  # @return [Object] whatever the provider's #extract_data returns
  # @raise [FileNotFoundError] if nothing exists at file_path
  # @raise [FileNotReadableError] if file_path is not a readable regular file
  # @raise [ConfigurationError] if neither argument nor configuration
  #   supplies a schema
  def extract(file_path:, output_schema: nil)
    validate_file!(file_path)

    schema = output_schema || @config.output_schema
    raise ConfigurationError, "Output schema not configured" if schema.nil?

    @provider.extract_data(file_path: file_path, output_schema: schema)
  end

  private

  # Checks that a provider is selected and, except for :faker, that an API
  # key is present.
  def validate_config!
    raise ConfigurationError, "LLM provider not configured" if @config.provider.nil?

    # The faker provider is exempt from the API key requirement.
    return if @config.provider == :faker
    return unless @config.api_key.nil? || @config.api_key.empty?

    raise ConfigurationError, "API key not configured"
  end

  # Instantiates the provider class matching @config.provider.
  def build_provider
    case @config.provider
    when :openai
      Providers::OpenAI.new(@config)
    when :anthropic
      Providers::Anthropic.new(@config)
    when :faker
      Providers::Faker.new(@config)
    else
      raise ConfigurationError, "Unsupported provider: #{@config.provider}"
    end
  end

  # Rejects paths that do not exist, and paths that exist but cannot be read
  # as a file. Directories satisfy both File.exist? and File.readable?, so a
  # regular-file check is needed to stop them here instead of failing later
  # inside the provider.
  def validate_file!(file_path)
    raise FileNotFoundError, "File not found: #{file_path}" unless File.exist?(file_path)
    return if File.file?(file_path) && File.readable?(file_path)

    raise FileNotReadableError, "File not readable: #{file_path}"
  end
end
58
+ end
@@ -0,0 +1,495 @@
1
+ # frozen_string_literal: true
2
+
3
require "fileutils"
require "rexml/document"
require "rexml/xpath"
require "zlib"
6
+
7
+ module CvParser
8
+ class PdfConverter
9
+ # Constants modules for better organization
10
# Page geometry shared by the layout and PDF-building classes.
# Values are in points, as the inline notes on the size constants indicate.
module PageConstants
  # Horizontal layout
  PAGE_WIDTH   = 612 # 8.5in × 72dpi
  LEFT_MARGIN  = 50
  RIGHT_MARGIN = 50

  # Vertical layout
  PAGE_HEIGHT   = 792 # 11in × 72dpi
  TOP_MARGIN    = 770 # starting Y position (points)
  BOTTOM_MARGIN = 50
end
18
+
19
# Font metrics used for wrapping and pagination.
module TextConstants
  FONT_SIZE = 12

  # average char width ≈ FONT_SIZE * 0.5
  CHAR_WIDTH_RATIO = 0.5

  # Distance between consecutive lines: 1.2 × the font size (a Float).
  LINE_HEIGHT = FONT_SIZE * 1.2
end
24
+
25
+ include PageConstants
26
+ include TextConstants
27
+
28
+ # Convert a .docx file into a multi-page PDF.
29
+ #
30
+ # @param input_path [String] path to the .docx file
31
+ # @param output_path [String] path where the PDF should be written
32
+ # @raise [ArgumentError] if input_path is missing or not .docx
33
+ # @raise [RuntimeError] if extraction or PDF writing fails
34
def convert(input_path, output_path)
  InputValidator.validate!(input_path)

  # DOCX archive -> document XML -> one string per paragraph
  document_xml = DocxExtractor.new(input_path).extract_document_xml
  paragraphs = XmlParser.new(document_xml).parse_paragraphs

  # paragraphs -> wrapped lines -> pages of lines
  layout = TextProcessor.new
  wrapped_lines = layout.process_paragraphs(paragraphs)
  pages = layout.paginate_lines(wrapped_lines)

  # pages -> PDF bytes -> file on disk
  FileWriter.write_pdf(output_path, PdfBuilder.new.build_pdf(pages))

  # The destination path is the return value.
  output_path
end
49
+
50
+ private
51
+
52
+ # Backward compatibility methods for existing tests
53
# Forwards to TextProcessor#max_chars_per_line, which is private there and is
# therefore reached through #send.
def max_chars_per_line
  processor = TextProcessor.new
  processor.send(:max_chars_per_line)
end

# Forwards to LineWrapper#wrap_line.
def wrap_line(text, max_chars)
  wrapper = LineWrapper.new
  wrapper.wrap_line(text, max_chars)
end

# Forwards to TextProcessor#lines_per_page (private there, hence #send).
def lines_per_page
  processor = TextProcessor.new
  processor.send(:lines_per_page)
end

# Forwards to PdfTextEscaper.escape.
def escape_pdf_text(text)
  escaped = PdfTextEscaper.escape(text)
  escaped
end

# Forwards to InputValidator.validate!.
def validate_input!(input_path)
  outcome = InputValidator.validate!(input_path)
  outcome
end

# Forwards to XmlParser#parse_paragraphs for the given XML string.
def parse_paragraphs(xml_string)
  parser = XmlParser.new(xml_string)
  parser.parse_paragraphs
end

# Forwards to ContentStreamBuilder#build for the given lines.
def build_content_stream(lines)
  builder = ContentStreamBuilder.new(lines)
  builder.build
end

# Forwards to PdfBuilder#build_pdf.
def build_pdf(pages)
  builder = PdfBuilder.new
  builder.build_pdf(pages)
end
84
+
85
+ # Input validation extracted to separate class
86
# Guards the converter's input: accepts only an existing regular file whose
# extension is .docx (case-insensitive).
class InputValidator
  # @param input_path [String, nil] candidate path
  # @return [nil] when the path is acceptable
  # @raise [ArgumentError] if the path is nil, does not exist, is not a
  #   regular file (e.g. a directory named "x.docx"), or lacks a .docx
  #   extension
  def self.validate!(input_path)
    return if docx_file?(input_path)

    raise ArgumentError, "Input must be an existing .docx file"
  end

  # True when input_path names a regular file ending in .docx.
  # nil is handled explicitly because File.file?(nil) raises TypeError, which
  # would otherwise escape instead of the ArgumentError raised by validate!.
  def self.docx_file?(input_path)
    return false if input_path.nil?

    File.file?(input_path) && File.extname(input_path).downcase == ".docx"
  end
  private_class_method :docx_file?
end
93
+
94
+ # DOCX extraction logic extracted to separate class
95
# Pulls word/document.xml out of a DOCX (ZIP) archive without a ZIP library.
#
# The file is scanned from the start for local file header signatures; each
# header found is parsed and its entry either skipped or, when its name is
# DOCUMENT_XML_PATH, decompressed and returned.
class DocxExtractor
  LOCAL_FILE_HEADER_SIG = 0x04034b50
  DOCUMENT_XML_PATH = "word/document.xml"

  # Number of header bytes that follow the 4-byte signature.
  LOCAL_HEADER_BODY_SIZE = 26
  # General-purpose flag bit 3: the writer did not know the sizes up front
  # and stored them in a data descriptor after the entry data, leaving the
  # size fields of the local header at zero.
  DATA_DESCRIPTOR_FLAG = 0x0008
  STORED = 0
  DEFLATED = 8
  READ_CHUNK_SIZE = 16 * 1024

  # @param docx_path [String] path of the .docx file to read
  def initialize(docx_path)
    @docx_path = docx_path
  end

  # @return [String] contents of word/document.xml, tagged as UTF-8
  # @raise [RuntimeError] if the entry is absent, truncated, or uses a
  #   compression method other than stored (0) or deflate (8)
  def extract_document_xml
    File.open(@docx_path, "rb") { |file| scan_for_document_xml(file) }
  end

  private

  # Walks the file looking for local file headers until document.xml is
  # found or the file is exhausted.
  def scan_for_document_xml(file)
    until file.eof?
      sig_bytes = file.read(4)
      break if sig_bytes.nil? || sig_bytes.bytesize < 4

      if sig_bytes.unpack1("V") == LOCAL_FILE_HEADER_SIG
        xml_content = process_zip_entry(file)
        return xml_content if xml_content
      else
        # Not a local file header; back up 3 bytes so the next read starts
        # one byte after the previous one.
        file.seek(-3, IO::SEEK_CUR)
      end
    end

    raise "#{DOCUMENT_XML_PATH} not found in DOCX"
  end

  # Handles the entry whose signature was just consumed.
  #
  # @return [String, nil] the XML when this entry is document.xml, nil
  #   otherwise (including when the header is cut off by end of file)
  def process_zip_entry(file)
    header = file.read(LOCAL_HEADER_BODY_SIZE)
    return nil if header.nil? || header.bytesize < LOCAL_HEADER_BODY_SIZE

    entry_info = parse_zip_header(header)
    name_bytes = file.read(entry_info[:fname_len])
    return nil if name_bytes.nil?

    # The extra field is not needed; skip over it.
    file.seek(entry_info[:extra_len], IO::SEEK_CUR)

    if name_bytes.force_encoding("UTF-8") == DOCUMENT_XML_PATH
      read_entry(file, entry_info)
    else
      # Skip the data of other entries without loading it into memory.
      file.seek(entry_info[:comp_size], IO::SEEK_CUR)
      nil
    end
  end

  # Decodes the fixed-size part of a local file header.
  #
  # @return [Hash] :compression, :comp_size, :fname_len, :extra_len and
  #   :streamed (true when flag bit 3 is set and no size is recorded)
  def parse_zip_header(header)
    _version, flags, compression, _mod_time, _mod_date, _crc32,
      comp_size, _uncomp_size, fname_len, extra_len = header.unpack("v v v v v V V V v v")

    {
      compression: compression,
      comp_size: comp_size,
      fname_len: fname_len,
      extra_len: extra_len,
      streamed: (flags & DATA_DESCRIPTOR_FLAG) != 0 && comp_size.zero?
    }
  end

  # Reads and decompresses the data of the entry positioned at `file`.
  def read_entry(file, entry_info)
    compression = entry_info[:compression]
    # With no recorded size, a deflate stream can still be read because the
    # stream marks its own end.
    return inflate_until_end(file) if entry_info[:streamed] && compression == DEFLATED

    data = file.read(entry_info[:comp_size])
    raise "#{DOCUMENT_XML_PATH} is truncated in DOCX" if data.nil? || data.bytesize < entry_info[:comp_size]

    decompress_data(data, compression)
  end

  # Inflates from the current file position until the deflate stream reports
  # completion; bytes after the end of the stream are ignored.
  def inflate_until_end(file)
    inflater = Zlib::Inflate.new(-Zlib::MAX_WBITS)
    xml = String.new

    until inflater.finished?
      chunk = file.read(READ_CHUNK_SIZE)
      raise "#{DOCUMENT_XML_PATH} is truncated in DOCX" if chunk.nil?

      xml << inflater.inflate(chunk)
    end

    xml.force_encoding("UTF-8")
  ensure
    inflater&.close
  end

  # Decompresses a fully-read entry according to its compression method.
  def decompress_data(compressed_data, compression)
    case compression
    when STORED
      compressed_data.force_encoding("UTF-8")
    when DEFLATED
      inflate(compressed_data)
    else
      raise "Unsupported compression method: #{compression}"
    end
  end

  # Raw-deflate decompression (negative window bits = no zlib header); the
  # result is tagged UTF-8 like the stored branch.
  def inflate(compressed_data)
    inflater = Zlib::Inflate.new(-Zlib::MAX_WBITS)
    inflater.inflate(compressed_data).force_encoding("UTF-8")
  ensure
    inflater&.close
  end
end
176
+
177
+ # XML parsing logic extracted to separate class
178
# Turns document XML into an array of paragraph strings.
class XmlParser
  # @param xml_string [String] XML text to parse
  def initialize(xml_string)
    @xml_string = xml_string
  end

  # Collects the text of every element whose local name is "p", in document
  # order, with surrounding whitespace stripped.
  #
  # @return [Array<String>] one string per matched element; "br" elements
  #   inside it become "\n"
  def parse_paragraphs
    doc = REXML::Document.new(@xml_string)

    REXML::XPath.match(doc, '//*[local-name()="p"]').map do |p_node|
      extract_text_with_breaks(p_node).strip
    end
  end

  private

  # Depth-first concatenation of the text beneath `node`.
  def extract_text_with_breaks(node)
    node.children.each_with_object(+"") do |child, result|
      case child.node_type
      when :text
        # Text#value is the unescaped text ("&amp;" -> "&"); Text#to_s would
        # keep the entity references as written in the XML.
        result << child.value
      when :element
        result << process_element_node(child)
      end
    end
  end

  # A "br" element contributes a newline; any other element contributes the
  # text of its descendants.
  def process_element_node(child)
    line_break?(child) ? "\n" : extract_text_with_breaks(child)
  end

  # True when the element's name, ignoring any namespace prefix, is "br".
  def line_break?(element)
    element.expanded_name.split(":").last == "br"
  end
end
225
+
226
+ # Text processing and pagination logic
227
# Converts paragraph strings into wrapped lines and groups lines into pages.
class TextProcessor
  include PageConstants
  include TextConstants

  # @param paragraphs [Array<String>] paragraph texts, possibly containing "\n"
  # @return [Array<String>] every wrapped line of every paragraph, in order
  def process_paragraphs(paragraphs)
    paragraphs.each_with_object([]) do |para_text, all_lines|
      process_single_paragraph(para_text).each { |wrapped| all_lines.concat(wrapped) }
    end
  end

  # @param lines [Array<String>]
  # @return [Array<Array<String>>] consecutive groups of at most
  #   lines_per_page lines
  def paginate_lines(lines)
    page_size = lines_per_page
    lines.each_slice(page_size).to_a
  end

  private

  # Splits a paragraph on "\n" (keeping trailing empty pieces) and wraps each
  # piece; a blank piece becomes a single empty line.
  #
  # @return [Array<Array<String>>] one array of lines per piece
  def process_single_paragraph(para_text)
    wrapper = LineWrapper.new
    width = max_chars_per_line

    para_text.split("\n", -1).map do |piece|
      piece.strip.empty? ? [""] : wrapper.wrap_line(piece, width)
    end
  end

  # Usable page width divided by the estimated average character width.
  def max_chars_per_line
    average_char_width = FONT_SIZE * CHAR_WIDTH_RATIO
    ((PAGE_WIDTH - LEFT_MARGIN - RIGHT_MARGIN) / average_char_width).floor
  end

  # Number of line steps between TOP_MARGIN and BOTTOM_MARGIN, plus one for
  # the line drawn at TOP_MARGIN itself.
  def lines_per_page
    ((TOP_MARGIN - BOTTOM_MARGIN) / LINE_HEIGHT).floor + 1
  end
end
265
+
266
+ # Line wrapping logic extracted to separate class
267
# Greedy word wrapper that also splits words longer than the line width.
class LineWrapper
  # @param text [String] a single line of text (no newlines expected)
  # @param max_chars [Integer] maximum characters per output line
  # @return [Array<String>] `[text]` unchanged when it already fits;
  #   otherwise the whitespace-separated words packed greedily into lines,
  #   with over-long words cut into max_chars-sized pieces on lines of
  #   their own
  def wrap_line(text, max_chars)
    return [text] unless text.length > max_chars

    wrapped = []
    pending = ""

    text.split(/\s+/).each do |token|
      if !pending.empty? && room_for?(pending, token, max_chars)
        pending << " " << token
        next
      end

      # Either the line is still empty or the token does not fit: flush
      # whatever is pending and start a new line with this token.
      wrapped << pending unless pending.empty?
      pending = start_line(token, max_chars, wrapped)
    end

    wrapped << pending unless pending.empty?
    wrapped
  end

  private

  # Returns the token as the start of a new line, or, when the token is too
  # long, appends its pieces to `wrapped` and returns an empty line.
  def start_line(token, max_chars, wrapped)
    return token if token.length <= max_chars

    token.chars.each_slice(max_chars) { |piece| wrapped << piece.join }
    ""
  end

  # True when `token` plus a separating space fits after `pending`.
  def room_for?(pending, token, max_chars)
    pending.length + 1 + token.length <= max_chars
  end
end
309
+
310
+ # PDF building logic extracted to separate class
311
# Entry point for PDF generation; the actual assembly is done by PdfStructure.
class PdfBuilder
  include PageConstants
  include TextConstants

  # @param pages [Array<Array<String>>] lines grouped per page
  # @return [String] the result of PdfStructure#build for those pages
  def build_pdf(pages)
    PdfStructure.new(pages).build
  end
end
320
+
321
+ # PDF structure and assembly
322
# Assembles a complete PDF file (header, objects, xref table, trailer) from
# pages of text lines.
#
# Object numbering: 1 = catalog, 2 = page tree, 3 = font, then for page
# index i the page object is 4 + 2i and its content stream is 5 + 2i.
# Objects are appended in exactly that order, so the n-th recorded offset
# belongs to object number n.
class PdfStructure
  include PageConstants

  PDF_VERSION = "%PDF-1.4"
  # Second header line: a comment made of non-ASCII characters.
  PDF_HEADER_BYTES = "%\u00E2\u00E3\u00CF\u00D3"

  # @param pages [Array<Array<String>>] lines grouped per page; an empty
  #   array is treated as a single blank page so the page tree is never
  #   written with /Count 0
  def initialize(pages)
    @pages = pages.empty? ? [[]] : pages
    @objects = []
    @offsets = []
  end

  # @return [String] the PDF file contents
  def build
    # Reset accumulated state so calling #build again on the same instance
    # does not append the objects and offsets a second time.
    @objects = []
    @offsets = []

    create_catalog_object
    create_pages_object
    create_font_object
    create_page_objects

    assemble_pdf
  end

  private

  def create_catalog_object
    @objects << build_object(1, "<< /Type /Catalog /Pages 2 0 R >>")
  end

  # Page tree node listing every page object as a kid.
  def create_pages_object
    page_ids = calculate_page_ids
    kids_str = page_ids.map { |pid| "#{pid} 0 R" }.join(" ")

    @objects << build_object(2, "<< /Type /Pages /Count #{page_ids.size} /Kids [#{kids_str}] >>")
  end

  def create_font_object
    @objects << build_object(3, "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>")
  end

  # Emits, for each page, the page object followed by its content stream.
  def create_page_objects
    page_ids = calculate_page_ids
    content_ids = calculate_content_ids

    @pages.each_with_index do |lines, idx|
      create_page_object(page_ids[idx], content_ids[idx])
      create_content_object(content_ids[idx], lines)
    end
  end

  def create_page_object(page_id, content_id)
    page_content = [
      "<< /Type /Page /Parent 2 0 R",
      "/MediaBox [0 0 #{PAGE_WIDTH} #{PAGE_HEIGHT}]",
      "/Resources << /Font << /F1 3 0 R >> >>",
      "/Contents #{content_id} 0 R",
      ">>"
    ].join("\n")

    @objects << build_object(page_id, page_content)
  end

  # Wraps the page's content stream in a stream object; /Length is the byte
  # size of the stream text produced by ContentStreamBuilder.
  def create_content_object(content_id, lines)
    content_stream = ContentStreamBuilder.new(lines).build
    content_obj = "<< /Length #{content_stream.bytesize} >>\nstream\n#{content_stream}endstream\n"

    @objects << build_object(content_id, content_obj)
  end

  def calculate_page_ids
    Array.new(@pages.size) { |idx| 4 + (idx * 2) }
  end

  def calculate_content_ids
    Array.new(@pages.size) { |idx| 5 + (idx * 2) }
  end

  # Serializes one indirect object (generation 0).
  def build_object(id, content)
    "#{id} 0 obj\n#{content}\nendobj\n"
  end

  # Concatenates header and objects, recording each object's byte offset
  # for the cross-reference table, then appends xref and trailer.
  def assemble_pdf
    pdf = +"#{PDF_VERSION}\n#{PDF_HEADER_BYTES}\n"

    @objects.each do |obj|
      @offsets << pdf.bytesize
      pdf << obj
    end

    add_xref_table(pdf)

    pdf
  end

  # Appends the xref table (one free entry for object 0, then one in-use
  # entry per object with a 10-digit zero-padded offset) and the trailer.
  def add_xref_table(pdf)
    xref_offset = pdf.bytesize
    entry_count = @objects.size + 1

    pdf << "xref\n0 #{entry_count}\n"
    pdf << "0000000000 65535 f \n"
    @offsets.each { |offset| pdf << "#{offset.to_s.rjust(10, "0")} 00000 n \n" }

    pdf << "trailer\n<< /Size #{entry_count} /Root 1 0 R >>\nstartxref\n#{xref_offset}\n%%EOF\n"
  end
end
443
+
444
+ # Content stream building for PDF pages
445
# Builds the text content stream for one page.
class ContentStreamBuilder
  include PageConstants
  include TextConstants

  # @param lines [Array<String>] the page's lines, top to bottom
  def initialize(lines)
    @lines = lines
  end

  # @return [String] "" for a page without lines; otherwise a BT ... ET text
  #   object that selects font F1 at FONT_SIZE, sets the leading to
  #   LINE_HEIGHT, moves to (LEFT_MARGIN, TOP_MARGIN) and shows each line
  def build
    return "" if @lines.empty?

    # T* advances by the current leading, whose initial value is 0; without
    # the TL operator every line would be drawn on the same baseline.
    stream = +"BT\n/F1 #{FONT_SIZE} Tf\n#{LINE_HEIGHT.round(2)} TL\n#{LEFT_MARGIN} #{TOP_MARGIN} Td\n"

    @lines.each_with_index do |line, idx|
      add_line_to_stream(stream, line, idx)
    end

    stream << "ET\n"
  end

  private

  # Appends the operators for one line. Every line except the first starts
  # with T* (move to next line), so line `index` always sits `index` rows
  # below the top, also when the first line is blank. Blank lines emit no
  # Tj operator.
  def add_line_to_stream(stream, line, index)
    stream << "T*\n" unless index.zero?
    stream << "(#{PdfTextEscaper.escape(line)}) Tj\n" unless line.empty?
  end
end
478
+
479
+ # PDF text escaping utility
480
# Escapes text for use inside a PDF literal string, i.e. between "(" and ")".
class PdfTextEscaper
  # Characters that must be preceded by a backslash.
  SPECIAL_CHARS = /[\\()]/.freeze

  # @param text [String]
  # @return [String] copy of text with a backslash inserted before every
  #   backslash, "(" and ")"
  def self.escape(text)
    # Block form on purpose: in a gsub replacement *string*, "\\\\" denotes a
    # single literal backslash, so gsub("\\", "\\\\") leaves backslashes
    # unescaped. A block's return value is inserted verbatim.
    text.gsub(SPECIAL_CHARS) { |char| "\\#{char}" }
  end
end
485
+
486
+ # File writing utility
487
# Writes PDF bytes to disk.
class FileWriter
  # Creates the destination directory, including any missing parent
  # directories, and writes data in binary mode.
  #
  # @param path [String] destination file path
  # @param data [String] file contents
  # @return [Integer] number of bytes written (the result of File.binwrite)
  def self.write_pdf(path, data)
    # mkdir_p is a no-op for existing directories and, unlike Dir.mkdir,
    # does not fail when more than one level is missing.
    FileUtils.mkdir_p(File.dirname(File.expand_path(path)))
    File.binwrite(path, data)
  end
end
494
+ end
495
+ end