hie-rag 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hie_rag/hie_rag.py CHANGED
@@ -16,7 +16,7 @@ class HieRag:
16
16
  def process_and_save_index_stream(self, file_name: str, uploaded_file: bytes, min_chunk_size, max_chunk_size):
17
17
  yield {"status": "🔍 Extracting text..."}
18
18
  print(f"Extracting text from {file_name}")
19
- extracted_text = self.utils.extract_text(uploaded_file)
19
+ extracted_text = self.utils.extract_text(uploaded_file, file_name=file_name)
20
20
 
21
21
  yield {"status": "✂️ Splitting into chunks..."}
22
22
  print(f"Splitting text into chunks with min size {min_chunk_size} and max size {max_chunk_size}")
hie_rag/utils.py CHANGED
@@ -16,36 +16,26 @@ class Utils:
16
16
  # self.client = OpenAI(api_key=api_key)
17
17
  self.client = AiClient(base_url=base_url)
18
18
 
19
- def extract_text(self, uploaded_file: bytes):
19
+ def extract_text(self, uploaded_bytes: bytes, file_name: str):
20
20
  """Extract text from an uploaded file using MarkItDown."""
21
- # md = MarkItDown(llm_client=self.client, llm_model="gpt-4o")
22
21
  md = MarkItDown()
23
22
 
24
- # Accept both raw bytes and file-like objects with `.read()`
25
- if isinstance(uploaded_file, bytes):
26
- file_bytes = uploaded_file
27
- suffix = ".bin" # fallback generic extension
28
- elif hasattr(uploaded_file, "read"):
29
- file_bytes = uploaded_file.read()
30
- filename = getattr(uploaded_file, "name", None) or getattr(uploaded_file, "filename", None)
31
- suffix = os.path.splitext(filename)[-1] if filename else ".bin"
32
- else:
33
- raise TypeError("Unsupported file type: must be bytes or file-like object")
34
-
35
- # Write to temp file for MarkItDown to process
36
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
37
- temp_file_path = temp_file.name
38
- temp_file.write(file_bytes)
23
+ # derive a real suffix from the filename
24
+ suffix = os.path.splitext(file_name)[1].lower() or ".txt"
25
+
26
+ # write to temp file
27
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
28
+ tmp.write(uploaded_bytes)
29
+ tmp_path = tmp.name
39
30
 
40
31
  try:
41
- # Redirect stderr to suppress native print warnings like "CropBox missing"
42
32
  with contextlib.redirect_stderr(io.StringIO()):
43
- extracted_text = md.convert(temp_file_path)
33
+ result = md.convert(tmp_path)
44
34
  finally:
45
- # Clean up the temporary file
46
- os.remove(temp_file_path)
35
+ os.remove(tmp_path)
47
36
 
48
- return extracted_text.text_content
37
+ # depending on MarkItDown version this may return a str or an object
38
+ return getattr(result, "text_content", result)
49
39
 
50
40
  def count_tokens(self, text: str, encoding="cl100k_base") -> int:
51
41
  """Count tokens in text using tiktoken"""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hie_rag
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: A hierarchical RAG framework for chunks retrieval.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -1,14 +1,14 @@
1
1
  hie_rag/__init__.py,sha256=p2glSTkCqGvMlcivcuKBStFh2C5adojaC9aGmF6nbhY,358
2
2
  hie_rag/ai_client.py,sha256=VbGQ0e3vZNn8W2YoR15Vvq2r-MUs-TBRNLGiImT4QxU,1000
3
- hie_rag/hie_rag.py,sha256=Nl_1WZM9IWhpNyZMvPzsae_u_xaCWEwrJgorZV-hp20,2741
3
+ hie_rag/hie_rag.py,sha256=6WG5TyX3w_2jm1lIWFgCn-J8lElPOmjqNfUKe7akAI0,2762
4
4
  hie_rag/process.py,sha256=Z4qpNmxSsxUJgnqJtw8cYWJTS6SxhRR7F7eX_akyVCU,2427
5
5
  hie_rag/split.py,sha256=gEQVt57xWruT5e1psgSOnwuBrQngzri3S4H6ZvKzsw4,5082
6
6
  hie_rag/split_and_process.py,sha256=PkFlnOF7nW4Zs47JTsGF4AY9VDOXz1AtxG9Die8_mQk,572
7
7
  hie_rag/tree_index.py,sha256=iTa25ohMv5O0HYc5JtzIzVAIhNdVklYiAIJvqyE8sbM,2722
8
- hie_rag/utils.py,sha256=GwGiQj-zc8-U9UXOFHTKkjHWHx8YTYquR27gsXJgzCE,4687
8
+ hie_rag/utils.py,sha256=_4TGiHuJ-Xo4JEEEdOjp4d1zxw6dNVsxROcom-vr7uU,4059
9
9
  hie_rag/vectordb.py,sha256=iI73ujrONjDaHU66RNdHnD2PZWSppnjm0isIHPJEGAY,11068
10
- hie_rag-0.2.2.dist-info/licenses/LICENSE,sha256=IwAxruLb1UG8F0KZtfnV6MJq10FRAxWM-XOTWkWsJt4,632
11
- hie_rag-0.2.2.dist-info/METADATA,sha256=3vTI_zyvJxOOq8VrrchOAn0a7m8hwQPISnlholFi3u0,1698
12
- hie_rag-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- hie_rag-0.2.2.dist-info/top_level.txt,sha256=tN2S3VpMUl6oLWL9sN4xIh4o2na_zjnW8rHiwPFf0T8,8
14
- hie_rag-0.2.2.dist-info/RECORD,,
10
+ hie_rag-0.2.3.dist-info/licenses/LICENSE,sha256=IwAxruLb1UG8F0KZtfnV6MJq10FRAxWM-XOTWkWsJt4,632
11
+ hie_rag-0.2.3.dist-info/METADATA,sha256=QkUjUo8zbEa7Ijm6mWOkmo-hoLzcYPDOQqzeqR7iL8g,1698
12
+ hie_rag-0.2.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
+ hie_rag-0.2.3.dist-info/top_level.txt,sha256=tN2S3VpMUl6oLWL9sN4xIh4o2na_zjnW8rHiwPFf0T8,8
14
+ hie_rag-0.2.3.dist-info/RECORD,,