content_extraction-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
+ #!/usr/bin/env python3
+ import sys
+ import json
+ import logging
+ from typing import Iterable
+
+ logger = logging.getLogger(__name__)
+
+
+ def read_input(input_file: str | None = None) -> str:
+     """Read content from a file or stdin; parsing is left to the caller."""
+     try:
+         if input_file:
+             with open(input_file, 'r', encoding='utf-8') as f:
+                 content = f.read()
+         else:
+             content = sys.stdin.read()
+     except Exception as e:
+         logger.error(f'Error reading input from {input_file or "stdin"}', exc_info=True)
+         raise RuntimeError(f'Error reading input: {e}') from e
+
+     return content
+
+
+ def write_output(output: str, output_file: str | None = None):
+     """Write a string to a file, or to stdout when no path is given."""
+     try:
+         if output_file:
+             with open(output_file, 'w', encoding='utf-8') as f:
+                 f.write(output)
+         else:
+             sys.stdout.write(output)
+     except IOError:
+         logger.error(f'Error writing to {output_file or "stdout"}', exc_info=True)
+         raise
+
+
+ def write_stream_of_obj(obj_stream: Iterable[dict], output_file: str | None = None):
+     """Write an iterable of dicts as JSON Lines to a file or stdout."""
+     try:
+         if output_file:
+             with open(output_file, 'w', encoding='utf-8') as f:
+                 for obj in obj_stream:
+                     f.write(json.dumps(obj))
+                     f.write('\n')
+         else:
+             for obj in obj_stream:
+                 sys.stdout.write(json.dumps(obj))
+                 sys.stdout.write('\n')
+     except IOError:
+         logger.error(f'Error writing stream to {output_file or "stdout"}', exc_info=True)
+         raise
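+
+
+ # Example usage (a sketch; assumes these helpers are imported from this module):
+ #
+ #     content = read_input('data.json')        # or read_input() to consume stdin
+ #     objs = ({'n': i} for i in range(3))
+ #     write_stream_of_obj(objs, 'out.jsonl')   # one JSON object per line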
@@ -0,0 +1,218 @@
+ import os
+ import sys
+ import json
+ import logging
+ import subprocess
+ import base64
+ import requests
+ import argparse
+ from urllib.parse import urlparse
+
+ from content_extraction.logging_config import setup_logging
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def set_env_vars():
+     env_vars = {
+         'MODEL_ID': 'mistral-ocr-2505',
+         'PROJECT_ID': 'edu-course-companion',
+         'REGION': 'europe-west4',
+     }
+     for k, v in env_vars.items():
+         os.environ[k] = v
+
+
+ def authenticate_and_get_token() -> str | None:
+     # Capture stdout and stderr so a failed gcloud invocation is detected.
+     process = subprocess.run(['gcloud', 'auth', 'print-access-token'], capture_output=True)
+     if process.returncode != 0:
+         logger.error(f'Error getting access token: {process.stderr.decode("utf-8")}')
+         return None
+     access_token = process.stdout.decode('utf-8').strip()
+     return access_token
+
+
+ def build_url_to_model():
+     region = os.getenv('REGION')
+     project_id = os.getenv('PROJECT_ID')
+     model_id = os.getenv('MODEL_ID')
+
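+     # With the defaults from set_env_vars, this resolves to:
+     # https://europe-west4-aiplatform.googleapis.com/v1/projects/edu-course-companion/locations/europe-west4/publishers/mistralai/models/mistral-ocr-2505:rawPredict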
+     url = f'https://{region}-aiplatform.googleapis.com/v1/projects/{project_id}/locations/{region}/publishers/mistralai/models/{model_id}:rawPredict'
+     return url
+
+
+ def file_to_base64_string(file_object):
+     """
+     Converts a Python file-like object to a base64 encoded string.
+
+     Args:
+         file_object: A file-like object opened in binary read mode (e.g., 'rb').
+
+     Returns:
+         A string containing the base64 encoded content of the file.
+     """
+     encoded_bytes = base64.b64encode(file_object.read())
+     encoded_string = encoded_bytes.decode('utf-8')
+     return encoded_string
+
+
+ def build_data_url_from_file(filepath):
+     """Creates a data URL from a local file path."""
+     with open(filepath, 'rb') as file:
+         base64_pdf = file_to_base64_string(file)
+     # The API expects this specific format for data URLs
+     document_url = f'data:application/pdf;base64,{base64_pdf}'
+     return document_url
+
+
+ def build_payload(document_url):
+     model_id = os.getenv('MODEL_ID')
+     payload = {
+         'model': model_id,
+         'document': {
+             'type': 'document_url',
+             'document_url': document_url,
+         },
+         'include_image_base64': True,  # Request image content
+     }
+     return payload
+
+
+ def make_request(payload) -> dict | None:
+     logger.debug('[Authentication] started')
+     access_token = authenticate_and_get_token()
+     if access_token is None:
+         return None
+
+     logger.debug('[Authentication] successful')
+     headers = {
+         'Authorization': f'Bearer {access_token}',
+         'Accept': 'application/json',
+     }
+
+     url = build_url_to_model()
+     logger.debug(f'[Request] started using URL: "{url}"')
+
+     response = requests.post(url=url, headers=headers, json=payload, timeout=600)  # cap (an assumed value) so the call cannot hang indefinitely
+     if response.status_code == 200:
+         try:
+             response_dict = response.json()
+         except json.JSONDecodeError as e:
+             logger.error(f'[Request] Error decoding JSON: {e}', extra={'response': response.text})
+             return None
+     else:
+         logger.error(
+             f'Request failed with status code: {response.status_code}',
+             extra={'response': response.text},
+         )
+         return None
+     logger.debug(f'[Request] completed using URL: "{url}"')
+     return response_dict
+
+
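+ # Sketch of the response shape consumed below, inferred from the field
+ # accesses in save_response_to_disk (not an exhaustive schema):
+ # {
+ #     'pages': [
+ #         {'index': 0,
+ #          'markdown': '# Page text ...',
+ #          'images': [{'id': 'img-0.jpeg', 'image_base64': 'data:image/jpeg;base64,...'}]},
+ #     ]
+ # }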
+ def save_response_to_disk(response_dict, output_dir):
+     logger.debug(f'[Saving to disk] started. Saving output to directory: {output_dir}.')
+     os.makedirs(output_dir, exist_ok=True)
+     for page in response_dict.get('pages', []):
+         logger.debug(f'[Saving to disk] started processing page {page["index"]}')
+         zfilled_index = str(page['index']).zfill(4)
+         page_filename = os.path.join(output_dir, f'page-{zfilled_index}.md')
+         with open(page_filename, 'w', encoding='utf-8') as f:
+             f.write(page['markdown'])
+         logger.debug(f'[Saving to disk] saved page "{page_filename}"')
+         logger.debug('[Saving to disk] started saving images')
+         for image in page.get('images', []):
+             logger.debug(f'[Saving to disk] started saving image {image["id"]}')
+             image_base64 = image['image_base64']
+             comma_index = image_base64.find(',')
+             if comma_index == -1:
+                 logger.warning(f'Could not find comma in image_base64 for {image["id"]}, skipping.')
+                 continue
+             encoded_image = image_base64[comma_index + 1 :]
+             image_bytes = base64.b64decode(encoded_image)
+             # image id already has the extension
+             image_filename = os.path.join(output_dir, image['id'])
+             with open(image_filename, 'wb') as f:
+                 f.write(image_bytes)
+             logger.debug(f'[Saving to disk] completed saving image {image["id"]}')
+         logger.debug(f'[Saving to disk] completed processing page {page["index"]}')
+     logger.debug('[Saving to disk] completed')
+
+
+ def main():
+     """Main CLI entry point."""
+     setup_logging()
+     parser = argparse.ArgumentParser(
+         description='Extract text and images from a document using OCR.',
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+     )
+     parser.add_argument('input_source', help='Input file path or URL to a document.')
+     parser.add_argument(
+         '-o',
+         '--output',
+         metavar='DIRECTORY',
+         help='Output directory to save pages and images. Defaults to a directory named after the input file.',
+     )
+     args = parser.parse_args()
+
+     set_env_vars()
+
+     input_source = args.input_source
+     output_dir = args.output
+
+     # Determine default output directory if not provided
+     if not output_dir:
+         if input_source.startswith(('http://', 'https://')):
+             parsed_url = urlparse(input_source)
+             filename = os.path.basename(parsed_url.path)
+             output_dir = os.path.splitext(filename)[0] if filename else 'ocr_output'
+         else:
+             output_dir = os.path.splitext(os.path.basename(input_source))[0]
+
+     logger.info(f'[Processing: {input_source}] Started!')
+     logger.info(f'Output will be saved to: {output_dir}')
+
+     if input_source.startswith(('http://', 'https://')):
+         # If the input is a URL, pass it directly.
+         document_url = input_source
+     else:
+         # If the input is a local file, check for existence and create a data URL.
+         if not os.path.exists(input_source):
+             logger.error(f"Error: Input file not found at '{input_source}'")
+             return 1
+         document_url = build_data_url_from_file(input_source)
+
+     payload = build_payload(document_url)
+     response_dict = make_request(payload)
+
+     if not response_dict:
+         logger.error('Failed to get a valid response from the OCR service.')
+         return 1
+
+     save_response_to_disk(response_dict, output_dir)
+     logger.info(f'[Processing: {input_source}] Completed successfully!')
+     return 0
+
+
+ if __name__ == '__main__':
+     sys.exit(main())
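+
+ # Example invocation (sketch; the script/module name is an assumption):
+ #
+ #     python ocr_extract.py paper.pdf -o ocr_out/
+ #     python ocr_extract.py https://example.com/paper.pdf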
@@ -0,0 +1,32 @@
+ import dspy
+
+ lm = dspy.LM("openai/gpt-4o-mini", temperature=0.3, max_tokens=5000)
+ dspy.configure(lm=lm)
+
+
+ class CorrectHeadingLevelSignature(dspy.Signature):
+     """Correct heading levels. Main title should be H1, Chapter Titles H2, etc."""
+
+     headings: str = dspy.InputField(
+         desc=r"String of headings extracted via OCR process, separated by \n"
+     )
+     corrected_headings: str = dspy.OutputField(
+         desc="Headings with corrected level"
+     )
+
+
+ class CorrectHeadingLevel(dspy.Module):
+     def __init__(self):
+         super().__init__()  # let dspy.Module track this module's predictors
+         self.predictor = dspy.ChainOfThought(CorrectHeadingLevelSignature)
+
+     def forward(self, headings):
+         prediction = self.predictor(headings=headings)
+         return prediction
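+
+
+ # Example usage (sketch; the configured LM needs an OPENAI_API_KEY):
+ #
+ #     corrector = CorrectHeadingLevel()
+ #     prediction = corrector(headings='# Book\n# Chapter 1\n# Section 1.1')
+ #     print(prediction.corrected_headings)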
@@ -0,0 +1,179 @@
+ import os
+ import argparse
+ import sys
+ import html
+ import logging
+ from pptx import Presentation
+ from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def extract_content(pptx_path: str, output_dir: str) -> tuple[str, str] | tuple[None, None]:
+     """
+     Extracts text, tables, and images from a PPTX file and saves them
+     into an HTML file and an images directory.
+
+     Args:
+         pptx_path (str): The path to the input PowerPoint presentation.
+         output_dir (str): The directory where the output HTML and images will be saved.
+
+     Returns:
+         tuple[str, str] | tuple[None, None]: A tuple containing the path to the
+         output HTML file and the images directory, or (None, None) on failure.
+     """
+     logger.info(f'[Extracting content] started from {pptx_path}')
+     images_dir = os.path.join(output_dir, 'images')
+     html_out_path = os.path.join(output_dir, 'index.html')
+     logger.debug(f'[Extracting content] Images directory: {images_dir}, HTML output path: {html_out_path}')
+
+     # Ensure output directories exist
+     os.makedirs(images_dir, exist_ok=True)
+
+     try:
+         prs = Presentation(pptx_path)
+     except Exception:
+         logger.error(f'[Extracting content] Could not open or parse {pptx_path}', exc_info=True)
+         return None, None
+
+     html_lines = [
+         '<!DOCTYPE html>',
+         '<html lang="en">',
+         '<head>',
+         '  <meta charset="UTF-8">',
+         '  <title>Extracted PPTX Content</title>',
+         '</head>',
+         '<body>',
+     ]
+
+     image_counter = 0
+
+     logger.debug('[Extracting content] started processing slides')
+     for slide_idx, slide in enumerate(prs.slides, start=1):
+         logger.debug(f'[Extracting content] started processing slide {slide_idx}')
+         html_lines.append(f'  <section id="slide-{slide_idx}">')
+
+         # 1) Title (if any)
+         title_text = None
+         for shape in slide.shapes:  # check the title placeholder first
+             if shape.is_placeholder and shape.placeholder_format.type == PP_PLACEHOLDER.TITLE:
+                 title_text = shape.text_frame.text.strip()
+                 break
+         if title_text:
+             html_lines.append(f'    <h1>{html.escape(title_text)}</h1>')
+
+         # 2) Walk every shape
+         for shape in slide.shapes:
+             # -- TABLES --
+             if shape.shape_type == MSO_SHAPE_TYPE.TABLE:
+                 html_lines.append('    <table border="1">')
+                 table = shape.table
+                 for row in table.rows:
+                     html_lines.append('      <tr>')
+                     for cell in row.cells:
+                         cell_txt = html.escape(cell.text).replace('\n', '<br/>')
+                         html_lines.append(f'        <td>{cell_txt}</td>')
+                     html_lines.append('      </tr>')
+                 html_lines.append('    </table>')
+
+             # -- IMAGES --
+             elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                 image = shape.image
+                 image_counter += 1
+                 ext = image.ext  # e.g. 'png', 'jpeg'
+                 img_name = f'slide{slide_idx}_img{image_counter}.{ext}'
+                 img_path = os.path.join(images_dir, img_name)
+                 with open(img_path, 'wb') as f:
+                     f.write(image.blob)
+                 # Relative path for the src attribute
+                 html_lines.append(f'    <img src="images/{img_name}" alt="Slide {slide_idx} image"/>')
+
+             # -- TEXT (including bullets) --
+             elif shape.has_text_frame:
+                 # skip re-printing the title placeholder
+                 if shape.is_placeholder and shape.placeholder_format.type in (
+                     PP_PLACEHOLDER.TITLE,
+                     PP_PLACEHOLDER.SUBTITLE,
+                 ):
+                     continue
+
+                 in_list = False
+                 for para in shape.text_frame.paragraphs:
+                     text = ''.join(run.text for run in para.runs).strip()
+                     if not text:
+                         continue
+
+                     # any indent > 0 is treated as a bullet
+                     if para.level > 0:
+                         if not in_list:
+                             html_lines.append('    <ul>')
+                             in_list = True
+                         html_lines.append(f'      <li>{html.escape(text)}</li>')
+                     else:
+                         if in_list:
+                             html_lines.append('    </ul>')
+                             in_list = False
+                         html_lines.append(f'    <p>{html.escape(text)}</p>')
+
+                 if in_list:
+                     html_lines.append('    </ul>')
+
+         html_lines.append('  </section>')
+         logger.debug(f'[Extracting content] completed processing slide {slide_idx}')
+
+     logger.debug('[Extracting content] completed processing slides')
+     html_lines.extend(['</body>', '</html>'])
+
+     # Write out the final HTML file
+     logger.debug('[Extracting content] started saving HTML file')
+     with open(html_out_path, 'w', encoding='utf-8') as f:
+         f.write('\n'.join(html_lines))
+     logger.debug('[Extracting content] completed saving HTML file')
+
+     logger.info(f'[Extracting content] completed from {pptx_path}')
+     return html_out_path, images_dir
+
+
+ def main():
+     """Main function to handle command line arguments and execute the script."""
+     logging.basicConfig(level=logging.INFO)  # minimal console logging so info-level messages are visible (an assumed default)
+     parser = argparse.ArgumentParser(
+         description='Extract content from a PowerPoint (PPTX) file to HTML.',
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   %(prog)s presentation.pptx                       # Outputs to 'output/' directory by default
+   %(prog)s presentation.pptx -o extracted_content  # Outputs to 'extracted_content/' directory
+ """,
+     )
+     parser.add_argument('pptx_file', help='Path to the input PPTX file.')
+     parser.add_argument(
+         '-o',
+         '--output',
+         default='output',
+         help="Path to the output directory (if not provided, defaults to 'output').",
+     )
+     args = parser.parse_args()
+
+     if not os.path.exists(args.pptx_file):
+         logger.error(f'Input file not found at {args.pptx_file}')
+         return 1
+
+     html_out, images_out = extract_content(args.pptx_file, args.output)
+
+     if html_out and images_out:
+         logger.info(f'Successfully extracted content to {html_out} with images in {images_out}/')
+         return 0
+
+     logger.error(f'Extraction failed for {args.pptx_file}.')
+     return 1
+
+
+ if __name__ == '__main__':
+     sys.exit(main())
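+
+ # Example invocation (sketch; the script name is an assumption):
+ #
+ #     python extract_pptx.py slides.pptx -o extracted_content
+ #     # -> extracted_content/index.html plus extracted_content/images/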