hie-rag 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hie_rag/hie_rag.py +1 -1
- hie_rag/utils.py +12 -22
- {hie_rag-0.2.2.dist-info → hie_rag-0.2.3.dist-info}/METADATA +1 -1
- {hie_rag-0.2.2.dist-info → hie_rag-0.2.3.dist-info}/RECORD +7 -7
- {hie_rag-0.2.2.dist-info → hie_rag-0.2.3.dist-info}/WHEEL +0 -0
- {hie_rag-0.2.2.dist-info → hie_rag-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {hie_rag-0.2.2.dist-info → hie_rag-0.2.3.dist-info}/top_level.txt +0 -0
hie_rag/hie_rag.py
CHANGED
@@ -16,7 +16,7 @@ class HieRag:
|
|
16
16
|
def process_and_save_index_stream(self, file_name: str, uploaded_file: bytes, min_chunk_size, max_chunk_size):
|
17
17
|
yield {"status": "🔍 Extracting text..."}
|
18
18
|
print(f"Extracting text from {file_name}")
|
19
|
-
extracted_text = self.utils.extract_text(uploaded_file)
|
19
|
+
extracted_text = self.utils.extract_text(uploaded_file, file_name=file_name)
|
20
20
|
|
21
21
|
yield {"status": "✂️ Splitting into chunks..."}
|
22
22
|
print(f"Splitting text into chunks with min size {min_chunk_size} and max size {max_chunk_size}")
|
hie_rag/utils.py
CHANGED
@@ -16,36 +16,26 @@ class Utils:
|
|
16
16
|
# self.client = OpenAI(api_key=api_key)
|
17
17
|
self.client = AiClient(base_url=base_url)
|
18
18
|
|
19
|
-
def extract_text(self,
|
19
|
+
def extract_text(self, uploaded_bytes: bytes, file_name: str):
|
20
20
|
"""Extract text from an uploaded file using MarkItDown."""
|
21
|
-
# md = MarkItDown(llm_client=self.client, llm_model="gpt-4o")
|
22
21
|
md = MarkItDown()
|
23
22
|
|
24
|
-
#
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
suffix = os.path.splitext(filename)[-1] if filename else ".bin"
|
32
|
-
else:
|
33
|
-
raise TypeError("Unsupported file type: must be bytes or file-like object")
|
34
|
-
|
35
|
-
# Write to temp file for MarkItDown to process
|
36
|
-
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
|
37
|
-
temp_file_path = temp_file.name
|
38
|
-
temp_file.write(file_bytes)
|
23
|
+
# derive a real suffix from the filename
|
24
|
+
suffix = os.path.splitext(file_name)[1].lower() or ".txt"
|
25
|
+
|
26
|
+
# write to temp file
|
27
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
28
|
+
tmp.write(uploaded_bytes)
|
29
|
+
tmp_path = tmp.name
|
39
30
|
|
40
31
|
try:
|
41
|
-
# Redirect stderr to suppress native print warnings like "CropBox missing"
|
42
32
|
with contextlib.redirect_stderr(io.StringIO()):
|
43
|
-
|
33
|
+
result = md.convert(tmp_path)
|
44
34
|
finally:
|
45
|
-
|
46
|
-
os.remove(temp_file_path)
|
35
|
+
os.remove(tmp_path)
|
47
36
|
|
48
|
-
return
|
37
|
+
# depending on MarkItDown version this may return a str or an object
|
38
|
+
return getattr(result, "text_content", result)
|
49
39
|
|
50
40
|
def count_tokens(self, text: str, encoding="cl100k_base") -> int:
|
51
41
|
"""Count tokens in text using tiktoken"""
|
@@ -1,14 +1,14 @@
|
|
1
1
|
hie_rag/__init__.py,sha256=p2glSTkCqGvMlcivcuKBStFh2C5adojaC9aGmF6nbhY,358
|
2
2
|
hie_rag/ai_client.py,sha256=VbGQ0e3vZNn8W2YoR15Vvq2r-MUs-TBRNLGiImT4QxU,1000
|
3
|
-
hie_rag/hie_rag.py,sha256=
|
3
|
+
hie_rag/hie_rag.py,sha256=6WG5TyX3w_2jm1lIWFgCn-J8lElPOmjqNfUKe7akAI0,2762
|
4
4
|
hie_rag/process.py,sha256=Z4qpNmxSsxUJgnqJtw8cYWJTS6SxhRR7F7eX_akyVCU,2427
|
5
5
|
hie_rag/split.py,sha256=gEQVt57xWruT5e1psgSOnwuBrQngzri3S4H6ZvKzsw4,5082
|
6
6
|
hie_rag/split_and_process.py,sha256=PkFlnOF7nW4Zs47JTsGF4AY9VDOXz1AtxG9Die8_mQk,572
|
7
7
|
hie_rag/tree_index.py,sha256=iTa25ohMv5O0HYc5JtzIzVAIhNdVklYiAIJvqyE8sbM,2722
|
8
|
-
hie_rag/utils.py,sha256=
|
8
|
+
hie_rag/utils.py,sha256=_4TGiHuJ-Xo4JEEEdOjp4d1zxw6dNVsxROcom-vr7uU,4059
|
9
9
|
hie_rag/vectordb.py,sha256=iI73ujrONjDaHU66RNdHnD2PZWSppnjm0isIHPJEGAY,11068
|
10
|
-
hie_rag-0.2.
|
11
|
-
hie_rag-0.2.
|
12
|
-
hie_rag-0.2.
|
13
|
-
hie_rag-0.2.
|
14
|
-
hie_rag-0.2.
|
10
|
+
hie_rag-0.2.3.dist-info/licenses/LICENSE,sha256=IwAxruLb1UG8F0KZtfnV6MJq10FRAxWM-XOTWkWsJt4,632
|
11
|
+
hie_rag-0.2.3.dist-info/METADATA,sha256=QkUjUo8zbEa7Ijm6mWOkmo-hoLzcYPDOQqzeqR7iL8g,1698
|
12
|
+
hie_rag-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
13
|
+
hie_rag-0.2.3.dist-info/top_level.txt,sha256=tN2S3VpMUl6oLWL9sN4xIh4o2na_zjnW8rHiwPFf0T8,8
|
14
|
+
hie_rag-0.2.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|