content-extraction 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
#!/bin/bash
#
# Run the document-processing pipeline on a single input file.
#
# Usage: process_document.sh <input_file_path> <output_directory>
#
# Steps:
#   1. Run content_extraction.do_ocr on the input, targeting <output_directory>.
#   2. Concatenate the page-*.md files found there into combined.md.
#   3. Collect the lines of combined.md that start with '#' into headings.md.
#   4. Run content_extraction.fix_ocr and store its stdout in fixed.md.
#   5. Render fixed.md to a standalone index.html with pandoc.
#
# All generated files are written inside <output_directory>.
# Exits non-zero as soon as any step fails.

# -e: stop on the first failing command; -u: reject unset variables;
# pipefail: a pipeline fails if any of its stages fails.
set -euo pipefail

# Check if the correct number of arguments is provided
if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <input_file_path> <output_directory>" >&2
  exit 1
fi

INPUT_FILE="$1"
OUTPUT_DIR="$2"

# pandoc is only used in the final step; verify it now so a missing tool is
# reported before the earlier steps have been run.
if ! command -v pandoc >/dev/null 2>&1; then
  echo "Error: pandoc is required but was not found in PATH" >&2
  exit 1
fi

# Step 1: Perform OCR and save the result to the directory
echo "Performing OCR on $INPUT_FILE..."
python -m content_extraction.do_ocr "$INPUT_FILE" -o "$OUTPUT_DIR"

# Step 2: Combine the OCR pages into a single file
echo "Combining pages into a single Markdown file..."
cd -- "$OUTPUT_DIR"

# Let the shell expand the glob instead of parsing `ls`; nullglob makes an
# unmatched pattern expand to nothing so the empty case can be detected.
shopt -s nullglob
pages=(page-*.md)
shopt -u nullglob

if [ "${#pages[@]}" -eq 0 ]; then
  echo "Error: no page-*.md files found in $OUTPUT_DIR" >&2
  exit 1
fi

# Order pages by the number following "page-" so that page-10.md comes after
# page-2.md; zero-padded names sort the same way as before. Each page is
# followed by two newlines, leaving a blank line between pages.
printf '%s\n' "${pages[@]}" \
  | LC_ALL=C sort -t- -k2,2n \
  | while IFS= read -r page; do
      cat -- "$page"
      echo
      echo
    done > combined.md

# Step 3: Extract headings from the combined Markdown file
echo "Extracting headings from combined.md..."
# grep exits 1 when nothing matches; a document without headings is not an
# error, so only statuses above 1 (real grep failures) abort the script.
grep "^#" combined.md > headings.md || {
  grep_status=$?
  if [ "$grep_status" -ne 1 ]; then
    exit "$grep_status"
  fi
}

# Step 4: Fix any OCR errors using provided script
echo "Fixing OCR errors..."
# Capture the output first so fixed.md is only written when fix_ocr succeeds.
fixed_text=$(python -m content_extraction.fix_ocr combined.md headings.md)
# printf instead of echo: the text is arbitrary data and must not be
# interpreted as echo options (e.g. a leading "-n").
printf '%s\n' "$fixed_text" > fixed.md

# Step 5: Render the markdown file to HTML
pandoc fixed.md -s -f markdown -t html -o index.html

echo "All processes completed successfully. Output saved in $OUTPUT_DIR"
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content_extraction
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
5
- Requires-Python: >=3.13
5
+ Requires-Python: >=3.12
6
6
  Description-Content-Type: text/markdown
7
7
  Requires-Dist: beautifulsoup4>=4.13.4
8
8
  Requires-Dist: lxml>=6.0.0
@@ -8,9 +8,10 @@ content_extraction/fix_ocr.py,sha256=2xJ4c3VsGSy1l-qAukvhaV8QOp6yu5BY99Gb0DwamWQ
8
8
  content_extraction/logging_config.py,sha256=GN1wuJJEspQ3z-FZIg134obsHweuiicZfz2an13a9_I,296
9
9
  content_extraction/parse_html.py,sha256=mOrZKXX59YcdWWhmbnoTnfXpwrg0znk38x0DMJIVes8,3137
10
10
  content_extraction/process.py,sha256=iLcmSjWhEg_DbgnftnVIfybIeLCuTEI57gasot0MtDk,1809
11
+ content_extraction/process_document.sh,sha256=QbQOrV7isiEyxin1PBNGYmCbfVQ_eW-JgsbuQV4VB2o,1106
11
12
  content_extraction/semantic_chunk_html.py,sha256=iJPspKkrt95lL46JpC_9fgT8GfV8cz04TWEnU99rbBw,5786
12
13
  content_extraction/split_and_create_digest.py,sha256=bKZL9Axc74zLH_VrlNjd46ZiVTQQrAY5iNJCotO-8v8,4253
13
- content_extraction-0.2.0.dist-info/METADATA,sha256=O0GNY4Ksv4Idb5pzCbQtYc0upufn_JoadEukGJRSUc0,6201
14
- content_extraction-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- content_extraction-0.2.0.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
16
- content_extraction-0.2.0.dist-info/RECORD,,
14
+ content_extraction-0.3.0.dist-info/METADATA,sha256=j0aBHEpJ1JTarADNp-2anMD8BKpuZV5Gj45sZ8h9u4I,6201
15
+ content_extraction-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
16
+ content_extraction-0.3.0.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
17
+ content_extraction-0.3.0.dist-info/RECORD,,