hindsight-api 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -16,6 +16,24 @@ from pydantic import BaseModel, Field, field_validator, ConfigDict
|
|
|
16
16
|
from ..llm_wrapper import OutputTooLongError, LLMConfig
|
|
17
17
|
|
|
18
18
|
|
|
19
|
+
def _sanitize_text(text: str) -> str:
|
|
20
|
+
"""
|
|
21
|
+
Sanitize text by removing invalid Unicode surrogate characters.
|
|
22
|
+
|
|
23
|
+
Surrogate characters (U+D800 to U+DFFF) are used in UTF-16 encoding
|
|
24
|
+
but cannot be encoded in UTF-8. They can appear in Python strings
|
|
25
|
+
from improperly decoded data (e.g., from JavaScript or broken files).
|
|
26
|
+
|
|
27
|
+
This function removes unpaired surrogates to prevent UnicodeEncodeError
|
|
28
|
+
when the text is sent to the LLM API.
|
|
29
|
+
"""
|
|
30
|
+
if not text:
|
|
31
|
+
return text
|
|
32
|
+
# Remove surrogate characters (U+D800 to U+DFFF) using regex
|
|
33
|
+
# These are invalid in UTF-8 and cause encoding errors
|
|
34
|
+
return re.sub(r'[\ud800-\udfff]', '', text)
|
|
35
|
+
|
|
36
|
+
|
|
19
37
|
class Entity(BaseModel):
|
|
20
38
|
"""An entity extracted from text."""
|
|
21
39
|
text: str = Field(
|
|
@@ -470,6 +488,10 @@ WHAT TO EXTRACT vs SKIP
|
|
|
470
488
|
max_retries = 2
|
|
471
489
|
last_error = None
|
|
472
490
|
|
|
491
|
+
# Sanitize input text to prevent Unicode encoding errors (e.g., unpaired surrogates)
|
|
492
|
+
sanitized_chunk = _sanitize_text(chunk)
|
|
493
|
+
sanitized_context = _sanitize_text(context) if context else 'none'
|
|
494
|
+
|
|
473
495
|
# Build user message with metadata and chunk content in a clear format
|
|
474
496
|
# Format event_date with day of week for better temporal reasoning
|
|
475
497
|
event_date_formatted = event_date.strftime('%A, %B %d, %Y') # e.g., "Monday, June 10, 2024"
|
|
@@ -477,10 +499,10 @@ WHAT TO EXTRACT vs SKIP
|
|
|
477
499
|
|
|
478
500
|
Chunk: {chunk_index + 1}/{total_chunks}
|
|
479
501
|
Event Date: {event_date_formatted} ({event_date.isoformat()})
|
|
480
|
-
Context: {
|
|
502
|
+
Context: {sanitized_context}
|
|
481
503
|
|
|
482
504
|
Text:
|
|
483
|
-
{
|
|
505
|
+
{sanitized_chunk}"""
|
|
484
506
|
|
|
485
507
|
for attempt in range(max_retries):
|
|
486
508
|
try:
|
|
@@ -24,7 +24,7 @@ hindsight_api/engine/retain/deduplication.py,sha256=9YXgVI_m1Mtz5Cv46ZceCEs0GwpL
|
|
|
24
24
|
hindsight_api/engine/retain/embedding_processing.py,sha256=cHTt3rPvDCWBWVPfSeg6bwH8HoXYGmP4bvS21boNONI,1734
|
|
25
25
|
hindsight_api/engine/retain/embedding_utils.py,sha256=Q24h_iw6pRAW2vDWPvauWY1o3bXLzW3eWvSxDALDiE0,1588
|
|
26
26
|
hindsight_api/engine/retain/entity_processing.py,sha256=meHOjsFzdvh1tbe6YlTofhcUs2Y6TcAN3S-0EKOvFP0,2705
|
|
27
|
-
hindsight_api/engine/retain/fact_extraction.py,sha256=
|
|
27
|
+
hindsight_api/engine/retain/fact_extraction.py,sha256=vOIlag9rJ8_8Q-TfOhMY88PeJpUyFIp0i7vdEyzbJLY,46125
|
|
28
28
|
hindsight_api/engine/retain/fact_storage.py,sha256=gRRQf_FCLsj5lUvdlOaxJsS5JosM6IhO_pik8Ur8VFg,5717
|
|
29
29
|
hindsight_api/engine/retain/link_creation.py,sha256=XJx7U3HboJLHtGgt_tHGsCa58lGo2ZyywzMNosrY9Xc,3154
|
|
30
30
|
hindsight_api/engine/retain/link_utils.py,sha256=PAXalIhAPZGcJv8EugcpwNgoWZ2D_ciVU3brHL-m090,26226
|
|
@@ -43,6 +43,6 @@ hindsight_api/engine/search/tracer.py,sha256=mcM9qZpj3YFudrBCESwc6YKNAiWIMx1lScX
|
|
|
43
43
|
hindsight_api/engine/search/types.py,sha256=qIeHW_gT7f291vteTZXygAM8oAaPp2dq6uEdvOyOwzs,5488
|
|
44
44
|
hindsight_api/web/__init__.py,sha256=WABqyqiAVFJJWOhKCytkj5Vcb61eAsRib3Ek7IMX6_U,378
|
|
45
45
|
hindsight_api/web/server.py,sha256=oPNJ_z4DO38MdK7Juyh2LdH0ipZ_BQF48cUM-4B_Uw0,5379
|
|
46
|
-
hindsight_api-0.0.
|
|
47
|
-
hindsight_api-0.0.
|
|
48
|
-
hindsight_api-0.0.
|
|
46
|
+
hindsight_api-0.0.15.dist-info/METADATA,sha256=bXvuwqAtqB2mCaDUCkiJ8t5tw085gMLWDLsGhFHF9Bo,1496
|
|
47
|
+
hindsight_api-0.0.15.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
48
|
+
hindsight_api-0.0.15.dist-info/RECORD,,
|
|
File without changes
|