litparser 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- litparser-0.5.0/PKG-INFO +118 -0
- litparser-0.5.0/README.md +85 -0
- litparser-0.5.0/litparser.egg-info/PKG-INFO +118 -0
- litparser-0.5.0/litparser.egg-info/SOURCES.txt +9 -0
- litparser-0.5.0/litparser.egg-info/dependency_links.txt +1 -0
- litparser-0.5.0/litparser.egg-info/entry_points.txt +2 -0
- litparser-0.5.0/litparser.egg-info/requires.txt +4 -0
- litparser-0.5.0/litparser.egg-info/top_level.txt +1 -0
- litparser-0.5.0/pyproject.toml +51 -0
- litparser-0.5.0/setup.cfg +4 -0
- litparser-0.5.0/setup.py +51 -0
litparser-0.5.0/PKG-INFO
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: litparser
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: Lightweight Document Parser - 순수 Python으로 PDF, DOCX, PPTX, HWPX 파싱
|
|
5
|
+
Home-page: https://github.com/ironwung/litparser
|
|
6
|
+
Author: ironwung
|
|
7
|
+
Author-email: ironwung <ironwung@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/ironwung/litparser
|
|
10
|
+
Project-URL: Documentation, https://github.com/ironwung/litparser#readme
|
|
11
|
+
Project-URL: Repository, https://github.com/ironwung/litparser
|
|
12
|
+
Keywords: pdf,parser,docx,pptx,hwpx,document,text-extraction,lightweight
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Text Processing
|
|
24
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
25
|
+
Requires-Python: >=3.8
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
30
|
+
Dynamic: author
|
|
31
|
+
Dynamic: home-page
|
|
32
|
+
Dynamic: requires-python
|
|
33
|
+
|
|
34
|
+
# LitParser
|
|
35
|
+
|
|
36
|
+
**Lit**eweight Document **Parser** - 순수 Python으로 구현된 문서 파서.
|
|
37
|
+
|
|
38
|
+
**외부 라이브러리 없이** PDF, DOCX, PPTX, HWPX 파일을 파싱합니다.
|
|
39
|
+
|
|
40
|
+
## 특징
|
|
41
|
+
|
|
42
|
+
- ✅ **Zero Dependencies** - 표준 라이브러리만 사용
|
|
43
|
+
- ✅ **다양한 포맷** - PDF, DOCX, PPTX, HWPX, TXT, MD
|
|
44
|
+
- ✅ **텍스트 추출** - 위치 정보 포함
|
|
45
|
+
- ✅ **테이블 감지** - 마크다운 변환
|
|
46
|
+
- ✅ **이미지 추출** - PNG, JPEG, JP2
|
|
47
|
+
- ✅ **출력 포맷** - Markdown, JSON
|
|
48
|
+
|
|
49
|
+
## 설치
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install litparser
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
소스에서:
|
|
56
|
+
```bash
|
|
57
|
+
pip install -e .
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## CLI
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# 텍스트 추출
|
|
64
|
+
litparser document.pdf
|
|
65
|
+
litparser report.docx
|
|
66
|
+
|
|
67
|
+
# 마크다운 변환
|
|
68
|
+
litparser document.pdf --markdown
|
|
69
|
+
litparser document.pdf --md -o result.md
|
|
70
|
+
|
|
71
|
+
# JSON 변환
|
|
72
|
+
litparser document.pdf --json
|
|
73
|
+
litparser document.pdf --json --include-images
|
|
74
|
+
|
|
75
|
+
# 테이블
|
|
76
|
+
litparser document.pdf --tables
|
|
77
|
+
|
|
78
|
+
# 분석
|
|
79
|
+
litparser document.pdf --analyze
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Python API
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from litparser import parse_pdf, extract_text, extract_tables
|
|
86
|
+
|
|
87
|
+
# PDF 파싱
|
|
88
|
+
doc = parse_pdf('document.pdf')
|
|
89
|
+
|
|
90
|
+
# 텍스트
|
|
91
|
+
text = extract_text(doc, page_num=0)
|
|
92
|
+
|
|
93
|
+
# 테이블
|
|
94
|
+
tables = extract_tables(doc, page_num=0)
|
|
95
|
+
for t in tables:
|
|
96
|
+
print(t.to_markdown())
|
|
97
|
+
|
|
98
|
+
# 마크다운/JSON 변환
|
|
99
|
+
from litparser.output_formatter import pdf_to_output, to_markdown, to_json
|
|
100
|
+
|
|
101
|
+
output = pdf_to_output(doc)
|
|
102
|
+
md = to_markdown(output)
|
|
103
|
+
json_str = to_json(output)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## 지원 포맷
|
|
107
|
+
|
|
108
|
+
| 포맷 | 확장자 | 텍스트 | 테이블 | 이미지 |
|
|
109
|
+
|------|--------|--------|--------|--------|
|
|
110
|
+
| PDF | .pdf | ✅ | ✅ | ✅ |
|
|
111
|
+
| Word | .docx | ✅ | ✅ | ✅ |
|
|
112
|
+
| PowerPoint | .pptx | ✅ | ✅ | ✅ |
|
|
113
|
+
| 한글 | .hwpx | ✅ | ✅ | ✅ |
|
|
114
|
+
| 텍스트 | .txt, .md | ✅ | - | - |
|
|
115
|
+
|
|
116
|
+
## 라이선스
|
|
117
|
+
|
|
118
|
+
MIT License
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# LitParser
|
|
2
|
+
|
|
3
|
+
**Lit**eweight Document **Parser** - 순수 Python으로 구현된 문서 파서.
|
|
4
|
+
|
|
5
|
+
**외부 라이브러리 없이** PDF, DOCX, PPTX, HWPX 파일을 파싱합니다.
|
|
6
|
+
|
|
7
|
+
## 특징
|
|
8
|
+
|
|
9
|
+
- ✅ **Zero Dependencies** - 표준 라이브러리만 사용
|
|
10
|
+
- ✅ **다양한 포맷** - PDF, DOCX, PPTX, HWPX, TXT, MD
|
|
11
|
+
- ✅ **텍스트 추출** - 위치 정보 포함
|
|
12
|
+
- ✅ **테이블 감지** - 마크다운 변환
|
|
13
|
+
- ✅ **이미지 추출** - PNG, JPEG, JP2
|
|
14
|
+
- ✅ **출력 포맷** - Markdown, JSON
|
|
15
|
+
|
|
16
|
+
## 설치
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install litparser
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
소스에서:
|
|
23
|
+
```bash
|
|
24
|
+
pip install -e .
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## CLI
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
# 텍스트 추출
|
|
31
|
+
litparser document.pdf
|
|
32
|
+
litparser report.docx
|
|
33
|
+
|
|
34
|
+
# 마크다운 변환
|
|
35
|
+
litparser document.pdf --markdown
|
|
36
|
+
litparser document.pdf --md -o result.md
|
|
37
|
+
|
|
38
|
+
# JSON 변환
|
|
39
|
+
litparser document.pdf --json
|
|
40
|
+
litparser document.pdf --json --include-images
|
|
41
|
+
|
|
42
|
+
# 테이블
|
|
43
|
+
litparser document.pdf --tables
|
|
44
|
+
|
|
45
|
+
# 분석
|
|
46
|
+
litparser document.pdf --analyze
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Python API
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from litparser import parse_pdf, extract_text, extract_tables
|
|
53
|
+
|
|
54
|
+
# PDF 파싱
|
|
55
|
+
doc = parse_pdf('document.pdf')
|
|
56
|
+
|
|
57
|
+
# 텍스트
|
|
58
|
+
text = extract_text(doc, page_num=0)
|
|
59
|
+
|
|
60
|
+
# 테이블
|
|
61
|
+
tables = extract_tables(doc, page_num=0)
|
|
62
|
+
for t in tables:
|
|
63
|
+
print(t.to_markdown())
|
|
64
|
+
|
|
65
|
+
# 마크다운/JSON 변환
|
|
66
|
+
from litparser.output_formatter import pdf_to_output, to_markdown, to_json
|
|
67
|
+
|
|
68
|
+
output = pdf_to_output(doc)
|
|
69
|
+
md = to_markdown(output)
|
|
70
|
+
json_str = to_json(output)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## 지원 포맷
|
|
74
|
+
|
|
75
|
+
| 포맷 | 확장자 | 텍스트 | 테이블 | 이미지 |
|
|
76
|
+
|------|--------|--------|--------|--------|
|
|
77
|
+
| PDF | .pdf | ✅ | ✅ | ✅ |
|
|
78
|
+
| Word | .docx | ✅ | ✅ | ✅ |
|
|
79
|
+
| PowerPoint | .pptx | ✅ | ✅ | ✅ |
|
|
80
|
+
| 한글 | .hwpx | ✅ | ✅ | ✅ |
|
|
81
|
+
| 텍스트 | .txt, .md | ✅ | - | - |
|
|
82
|
+
|
|
83
|
+
## 라이선스
|
|
84
|
+
|
|
85
|
+
MIT License
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: litparser
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: Lightweight Document Parser - 순수 Python으로 PDF, DOCX, PPTX, HWPX 파싱
|
|
5
|
+
Home-page: https://github.com/ironwung/litparser
|
|
6
|
+
Author: ironwung
|
|
7
|
+
Author-email: ironwung <ironwung@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/ironwung/litparser
|
|
10
|
+
Project-URL: Documentation, https://github.com/ironwung/litparser#readme
|
|
11
|
+
Project-URL: Repository, https://github.com/ironwung/litparser
|
|
12
|
+
Keywords: pdf,parser,docx,pptx,hwpx,document,text-extraction,lightweight
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Text Processing
|
|
24
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
25
|
+
Requires-Python: >=3.8
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
30
|
+
Dynamic: author
|
|
31
|
+
Dynamic: home-page
|
|
32
|
+
Dynamic: requires-python
|
|
33
|
+
|
|
34
|
+
# LitParser
|
|
35
|
+
|
|
36
|
+
**Lit**eweight Document **Parser** - 순수 Python으로 구현된 문서 파서.
|
|
37
|
+
|
|
38
|
+
**외부 라이브러리 없이** PDF, DOCX, PPTX, HWPX 파일을 파싱합니다.
|
|
39
|
+
|
|
40
|
+
## 특징
|
|
41
|
+
|
|
42
|
+
- ✅ **Zero Dependencies** - 표준 라이브러리만 사용
|
|
43
|
+
- ✅ **다양한 포맷** - PDF, DOCX, PPTX, HWPX, TXT, MD
|
|
44
|
+
- ✅ **텍스트 추출** - 위치 정보 포함
|
|
45
|
+
- ✅ **테이블 감지** - 마크다운 변환
|
|
46
|
+
- ✅ **이미지 추출** - PNG, JPEG, JP2
|
|
47
|
+
- ✅ **출력 포맷** - Markdown, JSON
|
|
48
|
+
|
|
49
|
+
## 설치
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install litparser
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
소스에서:
|
|
56
|
+
```bash
|
|
57
|
+
pip install -e .
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## CLI
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# 텍스트 추출
|
|
64
|
+
litparser document.pdf
|
|
65
|
+
litparser report.docx
|
|
66
|
+
|
|
67
|
+
# 마크다운 변환
|
|
68
|
+
litparser document.pdf --markdown
|
|
69
|
+
litparser document.pdf --md -o result.md
|
|
70
|
+
|
|
71
|
+
# JSON 변환
|
|
72
|
+
litparser document.pdf --json
|
|
73
|
+
litparser document.pdf --json --include-images
|
|
74
|
+
|
|
75
|
+
# 테이블
|
|
76
|
+
litparser document.pdf --tables
|
|
77
|
+
|
|
78
|
+
# 분석
|
|
79
|
+
litparser document.pdf --analyze
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Python API
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from litparser import parse_pdf, extract_text, extract_tables
|
|
86
|
+
|
|
87
|
+
# PDF 파싱
|
|
88
|
+
doc = parse_pdf('document.pdf')
|
|
89
|
+
|
|
90
|
+
# 텍스트
|
|
91
|
+
text = extract_text(doc, page_num=0)
|
|
92
|
+
|
|
93
|
+
# 테이블
|
|
94
|
+
tables = extract_tables(doc, page_num=0)
|
|
95
|
+
for t in tables:
|
|
96
|
+
print(t.to_markdown())
|
|
97
|
+
|
|
98
|
+
# 마크다운/JSON 변환
|
|
99
|
+
from litparser.output_formatter import pdf_to_output, to_markdown, to_json
|
|
100
|
+
|
|
101
|
+
output = pdf_to_output(doc)
|
|
102
|
+
md = to_markdown(output)
|
|
103
|
+
json_str = to_json(output)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## 지원 포맷
|
|
107
|
+
|
|
108
|
+
| 포맷 | 확장자 | 텍스트 | 테이블 | 이미지 |
|
|
109
|
+
|------|--------|--------|--------|--------|
|
|
110
|
+
| PDF | .pdf | ✅ | ✅ | ✅ |
|
|
111
|
+
| Word | .docx | ✅ | ✅ | ✅ |
|
|
112
|
+
| PowerPoint | .pptx | ✅ | ✅ | ✅ |
|
|
113
|
+
| 한글 | .hwpx | ✅ | ✅ | ✅ |
|
|
114
|
+
| 텍스트 | .txt, .md | ✅ | - | - |
|
|
115
|
+
|
|
116
|
+
## 라이선스
|
|
117
|
+
|
|
118
|
+
MIT License
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "litparser"
|
|
7
|
+
version = "0.5.0"
|
|
8
|
+
description = "Lightweight Document Parser - 순수 Python으로 PDF, DOCX, PPTX, HWPX 파싱"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
authors = [
|
|
12
|
+
{name = "ironwung", email = "ironwung@gmail.com"}
|
|
13
|
+
]
|
|
14
|
+
keywords = ["pdf", "parser", "docx", "pptx", "hwpx", "document", "text-extraction", "lightweight"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.8",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Topic :: Text Processing",
|
|
27
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
28
|
+
]
|
|
29
|
+
requires-python = ">=3.8"
|
|
30
|
+
dependencies = []
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
dev = [
|
|
34
|
+
"pytest>=7.0",
|
|
35
|
+
"pytest-cov>=4.0",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Homepage = "https://github.com/ironwung/litparser"
|
|
40
|
+
Documentation = "https://github.com/ironwung/litparser#readme"
|
|
41
|
+
Repository = "https://github.com/ironwung/litparser"
|
|
42
|
+
|
|
43
|
+
[project.scripts]
|
|
44
|
+
litparser = "litparser.__main__:main"
|
|
45
|
+
|
|
46
|
+
[tool.setuptools.packages.find]
|
|
47
|
+
where = ["."]
|
|
48
|
+
include = ["litparser*"]
|
|
49
|
+
|
|
50
|
+
[tool.pytest.ini_options]
|
|
51
|
+
testpaths = ["tests"]
|
litparser-0.5.0/setup.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LitParser - Lightweight Document Parser
|
|
3
|
+
pip install -e . 또는 python setup.py install
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from setuptools import setup, find_packages
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
readme = Path(__file__).parent / "README.md"
|
|
10
|
+
long_description = readme.read_text(encoding='utf-8') if readme.exists() else ""
|
|
11
|
+
|
|
12
|
+
setup(
|
|
13
|
+
name="litparser",
|
|
14
|
+
version="0.5.0",
|
|
15
|
+
description="Lightweight Document Parser - 순수 Python으로 PDF, DOCX, PPTX, HWPX 파싱",
|
|
16
|
+
long_description=long_description,
|
|
17
|
+
long_description_content_type="text/markdown",
|
|
18
|
+
author="Your Name",
|
|
19
|
+
author_email="your@email.com",
|
|
20
|
+
url="https://github.com/yourusername/litparser",
|
|
21
|
+
license="MIT",
|
|
22
|
+
|
|
23
|
+
packages=find_packages(include=['litparser', 'litparser.*']),
|
|
24
|
+
package_data={
|
|
25
|
+
'litparser': ['*.md'],
|
|
26
|
+
},
|
|
27
|
+
|
|
28
|
+
python_requires=">=3.8",
|
|
29
|
+
install_requires=[],
|
|
30
|
+
|
|
31
|
+
extras_require={
|
|
32
|
+
'dev': ['pytest>=7.0', 'pytest-cov>=4.0'],
|
|
33
|
+
},
|
|
34
|
+
|
|
35
|
+
entry_points={
|
|
36
|
+
'console_scripts': [
|
|
37
|
+
'litparser=litparser.__main__:main',
|
|
38
|
+
],
|
|
39
|
+
},
|
|
40
|
+
|
|
41
|
+
classifiers=[
|
|
42
|
+
"Development Status :: 4 - Beta",
|
|
43
|
+
"Intended Audience :: Developers",
|
|
44
|
+
"License :: OSI Approved :: MIT License",
|
|
45
|
+
"Operating System :: OS Independent",
|
|
46
|
+
"Programming Language :: Python :: 3",
|
|
47
|
+
"Topic :: Text Processing",
|
|
48
|
+
],
|
|
49
|
+
|
|
50
|
+
keywords="pdf parser docx pptx hwpx document lightweight",
|
|
51
|
+
)
|