pyconverters-openai_vision 0.5.52__py3-none-any.whl → 0.5.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2 +1,2 @@
1
1
  """OpenAIVision converter"""
2
- __version__ = "0.5.52"
2
+ __version__ = "0.5.54"
@@ -4,14 +4,14 @@ import re
4
4
  from enum import Enum
5
5
  from logging import Logger
6
6
  from re import Pattern
7
- from typing import List, cast, Type, Dict, Any, Optional
7
+ from typing import List, cast, Type, Dict, Any, Optional, Callable
8
8
 
9
9
  import filetype as filetype
10
10
  from log_with_context import add_logging_context
11
11
  from pydantic import Field, BaseModel
12
12
  from pymultirole_plugins.v1.converter import ConverterParameters, ConverterBase
13
13
  from pymultirole_plugins.v1.processor import ProcessorParameters, ProcessorBase
14
- from pymultirole_plugins.v1.schema import Document, AltText
14
+ from pymultirole_plugins.v1.schema import Document, AltText, Sentence
15
15
  from starlette.datastructures import UploadFile
16
16
 
17
17
  from .openai_utils import create_openai_model_enum, openai_chat_completion, gpt_filter, \
@@ -357,6 +357,73 @@ class OpenAIVisionProcessorBaseParameters(ProcessorParameters):
357
357
  )
358
358
 
359
359
 
360
def regex_sub_preserve_spans(
    text: str,
    regex: str,
    repl: Callable[[re.Match], str],
    spans: "Optional[List[Sentence]]",
    flags=0,
):
    """Substitute regex matches in ``text`` and re-map ``spans`` onto the result.

    Every match of ``regex`` found by ``re.finditer`` is replaced by
    ``repl(match)``. The offsets of the given spans are then translated so
    that they address the new text instead of the original one.

    Args:
        text: Original text.
        regex: Pattern handed to ``re.finditer``.
        repl: Callable receiving each match object and returning the string
            that replaces it.
        spans: Spans exposing ``start``/``end`` offsets into ``text`` (``end``
            is exclusive) and a ``metadata`` attribute, or ``None``.
        flags: Regex flags forwarded to ``re.finditer``.

    Returns:
        A ``(new_text, new_spans)`` pair. ``new_spans`` is ``None`` when
        ``spans`` is ``None``; otherwise it is a list of freshly built
        ``Sentence`` objects. A span whose first or last character offset
        falls outside ``text`` is dropped.
    """
    new_text_parts = []
    # Per original character offset i:
    #   start_map[i] -> offset in the new text where that character begins
    #   end_map[i]   -> exclusive offset in the new text where it ends
    # Characters consumed by a match map to the whole replacement, so a span
    # that starts inside a match snaps to the replacement's start and a span
    # that ends inside a match snaps to the replacement's end (instead of
    # being truncated one character after the replacement's start).
    start_map = []
    end_map = []

    last_pos = 0
    new_pos = 0

    for match in re.finditer(regex, text, flags):
        start, end = match.span()
        replacement = repl(match)

        # Copy the unchanged text that precedes the match; offsets map 1:1.
        unchanged_len = start - last_pos
        new_text_parts.append(text[last_pos:start])
        start_map.extend(range(new_pos, new_pos + unchanged_len))
        end_map.extend(range(new_pos + 1, new_pos + unchanged_len + 1))
        new_pos += unchanged_len

        # Insert the replacement and point every matched character at it.
        new_text_parts.append(replacement)
        replacement_end = new_pos + len(replacement)
        matched_len = end - start
        start_map.extend([new_pos] * matched_len)
        end_map.extend([replacement_end] * matched_len)

        new_pos = replacement_end
        last_pos = end

    # Remainder of the text after the last match.
    tail_len = len(text) - last_pos
    new_text_parts.append(text[last_pos:])
    start_map.extend(range(new_pos, new_pos + tail_len))
    end_map.extend(range(new_pos + 1, new_pos + tail_len + 1))

    new_text = "".join(new_text_parts)

    if spans is None:
        return new_text, None

    # Rebuild the spans against the new text.
    text_len = len(text)
    new_spans = []
    for span in spans:
        last_char = span.end - 1
        # Skip spans that do not lie within the original text.
        if not (0 <= span.start < text_len and 0 <= last_char < text_len):
            continue

        new_spans.append(
            Sentence(
                start=start_map[span.start],
                end=end_map[last_char],
                metadata=span.metadata,
            )
        )

    return new_text, new_spans
425
+
426
+
360
427
  class OpenAIVisionProcessorBase(ProcessorBase):
361
428
  __doc__ = """Generate text using [OpenAI Text Completion](https://platform.openai.com/docs/guides/completion) API
362
429
  You input some text as a prompt, and the model will generate a text completion that attempts to match whatever context or pattern you gave it."""
@@ -449,20 +516,27 @@ class OpenAIVisionProcessorBase(ProcessorBase):
449
516
 
450
517
  if params.replace_refs_altTexts_by_descriptions:
451
518
  text = document.text
452
- link_regex = r"!\[([^]]+)\]\(([^]]+)\)"
519
+ link_regex = r"!\[([^]]+)\]\(([^)]+)\)"
453
520
 
454
521
  def convert_links(matchobj):
455
522
  m = matchobj.group(0)
456
523
  m_id = matchobj.group(1)
457
524
  if m_id in alts:
458
525
  # markdown blockquote
459
- m_desc = "\n".join(["> " + li for li in alts[m_id].splitlines()])
526
+ # m_desc = "\n".join(["> " + li for li in alts[m_id].splitlines()])
527
+ m_desc = alts[m_id]
460
528
  return f"{m}\n{m_desc}\n"
461
529
  return m
462
530
 
463
- ptext = re.sub(link_regex, convert_links, text, 0,
464
- re.MULTILINE)
465
- document.text = ptext
531
+ new_text, new_sentences = regex_sub_preserve_spans(
532
+ text,
533
+ link_regex,
534
+ convert_links,
535
+ document.sentences,
536
+ flags=re.MULTILINE
537
+ )
538
+ document.text = new_text
539
+ document.sentences = new_sentences
466
540
  for altText in altTexts:
467
541
  if altText.name not in alts:
468
542
  document.altTexts.append(altText)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pyconverters-openai_vision
3
- Version: 0.5.52
3
+ Version: 0.5.54
4
4
  Summary: OpenAIVision converter
5
5
  Home-page: https://kairntech.com/
6
6
  Author: Olivier Terrier
@@ -0,0 +1,7 @@
1
+ pyconverters_openai_vision/__init__.py,sha256=7ClUk4wE7QRUGKj6xr8-2iGGhMXl4YKmtuY0IeYJq8I,52
2
+ pyconverters_openai_vision/openai_utils.py,sha256=XI4WYZ-EAVG0Vxd5yUDuZNDgEzqHJeriScxTUusi1oo,7740
3
+ pyconverters_openai_vision/openai_vision.py,sha256=d2qlPgD8vfMelZVH-6fvdXWns9nkVxCIAwx_UenOvRc,25862
4
+ pyconverters_openai_vision-0.5.54.dist-info/entry_points.txt,sha256=NR0re-yebKKyhApky1I6nDQzjJQfEyfOkJlJju0Ngzo,404
5
+ pyconverters_openai_vision-0.5.54.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
6
+ pyconverters_openai_vision-0.5.54.dist-info/METADATA,sha256=DpVk7gRhFPN0WYWyS1Pzhy-BKHVWtC2dWeLhRI70yX4,2662
7
+ pyconverters_openai_vision-0.5.54.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- pyconverters_openai_vision/__init__.py,sha256=g7HB-L3arbfi4y27TjX8fUur2DtWbhxcEmibUoByG5U,52
2
- pyconverters_openai_vision/openai_utils.py,sha256=XI4WYZ-EAVG0Vxd5yUDuZNDgEzqHJeriScxTUusi1oo,7740
3
- pyconverters_openai_vision/openai_vision.py,sha256=A5TRj0q-Ojzi4LcKxiP9qdTXGglR_v2YgGApmrJWEeE,23855
4
- pyconverters_openai_vision-0.5.52.dist-info/entry_points.txt,sha256=NR0re-yebKKyhApky1I6nDQzjJQfEyfOkJlJju0Ngzo,404
5
- pyconverters_openai_vision-0.5.52.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
6
- pyconverters_openai_vision-0.5.52.dist-info/METADATA,sha256=1BKH6j0kih-UTGREwtjmODwX_Mz0lfd8lHtqrInrHvM,2662
7
- pyconverters_openai_vision-0.5.52.dist-info/RECORD,,