hjxdl 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hdl/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.1.4'
16
- __version_tuple__ = version_tuple = (0, 1, 4)
15
+ __version__ = version = '0.1.5'
16
+ __version_tuple__ = version_tuple = (0, 1, 5)
hdl/utils/llm/extract.py CHANGED
@@ -13,13 +13,22 @@ class DocExtractor():
13
13
  ltp_model_path: str = None,
14
14
  lang: str = "chi_sim"
15
15
  ) -> None:
16
+ """Initialize the object with the specified LTP model path and language.
17
+
18
+ Args:
19
+ ltp_model_path (str): The file path to the LTP model. Default is None.
20
+ lang (str): The language to be used for processing. Default is "chi_sim".
21
+
22
+ Returns:
23
+ None
24
+ """
16
25
  self.ltp_model_path = ltp_model_path
17
26
  self.lang = lang
18
27
 
19
28
  self.split = None
20
29
  if self.ltp_model_path is not None:
21
30
  from ltp import StnSplit, LTP
22
- ltp = LTP(ltp_model)
31
+ ltp = LTP(self.ltp_model_path)
23
32
  self.split = StnSplit()
24
33
  # sents = self.split.split(text)
25
34
 
@@ -38,6 +47,14 @@ class DocExtractor():
38
47
  def text_from_plain(
39
48
  txt_path
40
49
  ):
50
+ """Reads and returns the text content from a plain text file.
51
+
52
+ Args:
53
+ txt_path (str): The path to the plain text file.
54
+
55
+ Returns:
56
+ str: The text content read from the file.
57
+ """
41
58
  with open(txt_path, "r") as f:
42
59
  text = f.read()
43
60
  return text
@@ -46,13 +63,29 @@ class DocExtractor():
46
63
  def extract_text_from_image(
47
64
  image: Image.Image,
48
65
  ) -> str:
66
+ """Extracts text from the given image using pytesseract.
67
+
68
+ Args:
69
+ image (PIL.Image.Image): The input image from which text needs to be extracted.
70
+
71
+ Returns:
72
+ str: The extracted text from the image.
73
+ """
49
74
  return pytesseract.image_to_string(image, lang=self.lang)
50
75
 
51
76
  @staticmethod
52
77
  def is_within_bbox(
53
78
  bbox1, bbox2
54
79
  ):
55
- """Check if bbox1 is within bbox2."""
80
+ """Check if bbox1 is within bbox2.
81
+
82
+ Args:
83
+ bbox1 (list): List of 4 integers representing the bounding box coordinates [x_min, y_min, x_max, y_max].
84
+ bbox2 (list): List of 4 integers representing the bounding box coordinates [x_min, y_min, x_max, y_max].
85
+
86
+ Returns:
87
+ bool: True if bbox1 is within bbox2, False otherwise.
88
+ """
56
89
  return bbox1[0] >= bbox2[0] and bbox1[1] >= bbox2[1] and bbox1[2] <= bbox2[2] and bbox1[3] <= bbox2[3]
57
90
 
58
91
  def text_tables_from_pdf(
@@ -60,6 +93,15 @@ class DocExtractor():
60
93
  pdf_path,
61
94
  table_from_pic: bool = False
62
95
  ):
96
+ """Extract text and tables from a PDF file.
97
+
98
+ Args:
99
+ pdf_path (str): Path to the PDF file.
100
+ table_from_pic (bool, optional): Whether to extract tables from images in the PDF. Defaults to False.
101
+
102
+ Returns:
103
+ tuple: A tuple containing a list of extracted texts and a list of extracted tables as DataFrames.
104
+ """
63
105
  all_tables = []
64
106
  all_texts = []
65
107
  with pdfplumber.open(pdf_path) as pdf:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hjxdl
3
- Version: 0.1.4
3
+ Version: 0.1.5
4
4
  Summary: A collection of functions for Jupyter notebooks
5
5
  Home-page: https://github.com/huluxiaohuowa/hdl
6
6
  Author: Jianxing Hu
@@ -1,5 +1,5 @@
1
1
  hdl/__init__.py,sha256=5sZZNySv08wwfzJcSDssGTqUn9wlmDsR6R4XB8J8mFM,70
2
- hdl/_version.py,sha256=9GTNkADgEYZ6fEjCvZZUdKyqxiPIgtskLFZNJz7nq_U,411
2
+ hdl/_version.py,sha256=zBVX2byWL6NrFlwjvahpnvSqDsdtebZW0K9WM_cj20U,411
3
3
  hdl/args/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  hdl/args/loss_args.py,sha256=s7YzSdd7IjD24rZvvOrxLLFqMZQb9YylxKeyelSdrTk,70
5
5
  hdl/controllers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -86,10 +86,10 @@ hdl/utils/general/glob.py,sha256=8-RCnt6L297wMIfn34ZAMCsGCZUjHG3MGglGZI1cX0g,491
86
86
  hdl/utils/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
87
87
  hdl/utils/llm/chat.py,sha256=H2c8assJlSdZQKIfPkYrVZHqv66TsdsxtaLXv0kNe1w,11565
88
88
  hdl/utils/llm/embs.py,sha256=sC8tga7HgDwPI2m7TDWKp9kkxEIMxEyMtgmEhfRi4vI,6362
89
- hdl/utils/llm/extract.py,sha256=qlthQiFQm5DfHDzimjQKotzLB7oPk5UTODsw22pzs80,4891
89
+ hdl/utils/llm/extract.py,sha256=WbTlQmcPNfrKmzSZSKSdWLA0LqLAgoa4J_IcjxBXACI,6506
90
90
  hdl/utils/schedulers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
91
91
  hdl/utils/schedulers/norm_lr.py,sha256=bDwCmdEK-WkgxQMFBiMuchv8Mm7C0-GZJ6usm-PQk14,4461
92
- hjxdl-0.1.4.dist-info/METADATA,sha256=SCG5RpSG11LK0MmcU22aJOOV5Dmh_D7cCgi6kYaSnd0,542
93
- hjxdl-0.1.4.dist-info/WHEEL,sha256=-oYQCr74JF3a37z2nRlQays_SX2MqOANoqVjBBAP2yE,91
94
- hjxdl-0.1.4.dist-info/top_level.txt,sha256=-kxwTM5JPhylp06z3zAVO3w6_h7wtBfBo2zgM6YZoTk,4
95
- hjxdl-0.1.4.dist-info/RECORD,,
92
+ hjxdl-0.1.5.dist-info/METADATA,sha256=bcWpSx6Y2t3rb8n8Ry--JImcj0wIJh6CcZRDdE3xzc8,542
93
+ hjxdl-0.1.5.dist-info/WHEEL,sha256=-oYQCr74JF3a37z2nRlQays_SX2MqOANoqVjBBAP2yE,91
94
+ hjxdl-0.1.5.dist-info/top_level.txt,sha256=-kxwTM5JPhylp06z3zAVO3w6_h7wtBfBo2zgM6YZoTk,4
95
+ hjxdl-0.1.5.dist-info/RECORD,,
File without changes