flatfish 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. flatfish-0.1.0/.gitignore +85 -0
  2. flatfish-0.1.0/PKG-INFO +182 -0
  3. flatfish-0.1.0/README.md +136 -0
  4. flatfish-0.1.0/pyproject.toml +101 -0
  5. flatfish-0.1.0/src/flatfish/__init__.py +3 -0
  6. flatfish-0.1.0/src/flatfish/__main__.py +6 -0
  7. flatfish-0.1.0/src/flatfish/cli.py +792 -0
  8. flatfish-0.1.0/src/flatfish/config.py +280 -0
  9. flatfish-0.1.0/src/flatfish/dataset.py +182 -0
  10. flatfish-0.1.0/src/flatfish/htr/__init__.py +1 -0
  11. flatfish-0.1.0/src/flatfish/htr/engine.py +309 -0
  12. flatfish-0.1.0/src/flatfish/htr/models.py +291 -0
  13. flatfish-0.1.0/src/flatfish/pipeline.py +369 -0
  14. flatfish-0.1.0/src/flatfish/site/__init__.py +1 -0
  15. flatfish-0.1.0/src/flatfish/site/builder.py +1127 -0
  16. flatfish-0.1.0/src/flatfish/site/templates/base.html +16 -0
  17. flatfish-0.1.0/src/flatfish/site/templates/browse_dates.html +294 -0
  18. flatfish-0.1.0/src/flatfish/site/templates/browse_entities.html +316 -0
  19. flatfish-0.1.0/src/flatfish/site/templates/document.html +252 -0
  20. flatfish-0.1.0/src/flatfish/site/templates/index.html +17 -0
  21. flatfish-0.1.0/src/flatfish/site/templates/main.html +233 -0
  22. flatfish-0.1.0/src/flatfish/site/templates/overview/changes.html +232 -0
  23. flatfish-0.1.0/src/flatfish/site/templates/overview/questions.html +192 -0
  24. flatfish-0.1.0/src/flatfish/site/templates/overview/summary.html +188 -0
  25. flatfish-0.1.0/src/flatfish/site/templates/overview/timeline.html +205 -0
  26. flatfish-0.1.0/src/flatfish/site/templates/overview.html +20 -0
  27. flatfish-0.1.0/src/flatfish/summary/__init__.py +1 -0
  28. flatfish-0.1.0/src/flatfish/summary/qwen.py +1651 -0
  29. flatfish-0.1.0/src/flatfish/translation/__init__.py +15 -0
  30. flatfish-0.1.0/src/flatfish/translation/translator.py +226 -0
  31. flatfish-0.1.0/src/flatfish/utils/__init__.py +1 -0
  32. flatfish-0.1.0/src/flatfish/utils/dates.py +164 -0
  33. flatfish-0.1.0/src/flatfish/utils/images.py +127 -0
  34. flatfish-0.1.0/src/flatfish/utils/logging.py +66 -0
  35. flatfish-0.1.0/src/flatfish/utils/text.py +168 -0
  36. flatfish-0.1.0/tests/conftest.py +42 -0
  37. flatfish-0.1.0/tests/test_config.py +71 -0
  38. flatfish-0.1.0/tests/test_dates.py +117 -0
  39. flatfish-0.1.0/tests/test_text.py +179 -0
@@ -0,0 +1,85 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # PyInstaller
28
+ *.manifest
29
+ *.spec
30
+
31
+ # Installer logs
32
+ pip-log.txt
33
+ pip-delete-this-directory.txt
34
+
35
+ # Unit test / coverage reports
36
+ htmlcov/
37
+ .tox/
38
+ .nox/
39
+ .coverage
40
+ .coverage.*
41
+ .cache
42
+ nosetests.xml
43
+ coverage.xml
44
+ *.cover
45
+ *.py,cover
46
+ .hypothesis/
47
+ .pytest_cache/
48
+
49
+ # Translations
50
+ *.mo
51
+ *.pot
52
+
53
+ # Environments
54
+ .env
55
+ .venv
56
+ env/
57
+ venv/
58
+ ENV/
59
+ env.bak/
60
+ venv.bak/
61
+
62
+ # IDE
63
+ .idea/
64
+ .vscode/
65
+ *.swp
66
+ *.swo
67
+ *~
68
+
69
+ # Project specific
70
+ _site/
71
+ transcriptions/
72
+ entities/
73
+ summaries/
74
+ images/
75
+ *.dzi
76
+ tiles/
77
+
78
+ # macOS
79
+ .DS_Store
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+ _build/
84
+ docs/_build/
85
+ translations/
@@ -0,0 +1,182 @@
1
+ Metadata-Version: 2.4
2
+ Name: flatfish
3
+ Version: 0.1.0
4
+ Summary: Historical document analysis CLI - Extract, analyze, and present handwritten text from document images
5
+ Project-URL: Homepage, https://github.com/PULdischo/flatfish
6
+ Project-URL: Documentation, https://github.com/PULdischo/flatfish#readme
7
+ Project-URL: Repository, https://github.com/PULdischo/flatfish
8
+ Project-URL: Issues, https://github.com/PULdischo/flatfish/issues
9
+ Author-email: Andrew Janco <apjanco@gmail.com>
10
+ License-Expression: MIT
11
+ Keywords: document-analysis,handwriting,historical-documents,htr,named-entity-recognition,ocr
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
21
+ Classifier: Topic :: Text Processing
22
+ Requires-Python: >=3.10
23
+ Requires-Dist: datasets>=2.0
24
+ Requires-Dist: deep-translator>=1.11.0
25
+ Requires-Dist: httpx>=0.25
26
+ Requires-Dist: jinja2>=3.0
27
+ Requires-Dist: markdown>=3.0
28
+ Requires-Dist: netlify-python>=0.4.0
29
+ Requires-Dist: openai>=1.0
30
+ Requires-Dist: pillow>=10.0
31
+ Requires-Dist: pydantic-settings>=2.0
32
+ Requires-Dist: pydantic>=2.0
33
+ Requires-Dist: python-dotenv>=1.0
34
+ Requires-Dist: pyyaml>=6.0
35
+ Requires-Dist: rich>=13.0
36
+ Requires-Dist: tqdm>=4.0
37
+ Requires-Dist: typer[all]>=0.12
38
+ Provides-Extra: dev
39
+ Requires-Dist: black>=23.0; extra == 'dev'
40
+ Requires-Dist: mypy>=1.0; extra == 'dev'
41
+ Requires-Dist: pre-commit>=3.0; extra == 'dev'
42
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
43
+ Requires-Dist: pytest>=7.0; extra == 'dev'
44
+ Requires-Dist: ruff>=0.1; extra == 'dev'
45
+ Description-Content-Type: text/markdown
46
+
47
+ <picture>
48
+ <source media="(prefers-color-scheme: dark)" srcset="logo-dark.png">
49
+ <source media="(prefers-color-scheme: light)" srcset="logo-light.png">
50
+ <img width="100" src="logo-light.png" alt="Flatfish Logo">
51
+ </picture>
52
+
53
+ # Flatfish
54
+
55
+ Historical document analysis CLI - Extract, analyze, and present handwritten text from document images.
56
+
57
+ ## Features
58
+
59
+ - 📜 **Handwritten Text Recognition (HTR)** - Extract text from historical document images
60
+ - 🏷️ **Named Entity Recognition** - Identify people, places, dates, and more with contextual descriptions
61
+ - 📊 **AI-Powered Summaries** - Generate timelines, track changes, and suggest research questions
62
+ - 🌐 **Static Website Builder** - Create searchable, browsable document collections
63
+
64
+ ## Installation
65
+
66
+ ```bash
67
+ pip install flatfish
68
+ ```
69
+
70
+ ## Quick Start
71
+
72
+ ```bash
73
+ # Initialize a new project
74
+ flatfish init
75
+
76
+ # Edit configuration
77
+ nano flatfish.yaml
78
+ nano .env
79
+
80
+ # Validate setup
81
+ flatfish validate
82
+
83
+ # Process documents
84
+ flatfish process
85
+
86
+ # Preview the site
87
+ flatfish serve
88
+ ```
89
+
90
+ ## Configuration
91
+
92
+ ### flatfish.yaml
93
+
94
+ ```yaml
95
+ dataset:
96
+ source: "username/dataset-name"
97
+ splits:
98
+ - "train"
99
+ image_column: "image"
100
+
101
+ processing:
102
+ extract_entities: true
103
+ entity_context: true
104
+
105
+ summary:
106
+ enabled: true
107
+ model: "qwen-vl-max"
108
+
109
+ website:
110
+ title: "Document Collection"
111
+ password: "changeme"
112
+ ```
113
+
114
+ ### .env
115
+
116
+ ```bash
117
+ HUGGINGFACE_TOKEN=hf_xxxxxxxxxxxxx
118
+ DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxx
119
+ ```
120
+
121
+ ## Commands
122
+
123
+ | Command | Description |
124
+ |---------|-------------|
125
+ | `flatfish init` | Initialize a new project |
126
+ | `flatfish process` | Run the full pipeline |
127
+ | `flatfish extract` | Extract text from images only |
128
+ | `flatfish entities` | Extract entities only |
129
+ | `flatfish summarize` | Generate AI summary only |
130
+ | `flatfish build` | Build static site only |
131
+ | `flatfish serve` | Preview site locally |
132
+ | `flatfish deploy` | Deploy to Netlify |
133
+ | `flatfish status` | Show processing status |
134
+ | `flatfish validate` | Validate configuration |
135
+
136
+ ## Deployment
137
+
138
+ Deploy your site to Netlify:
139
+
140
+ ```bash
141
+ # Install netlify-python
142
+ pip install netlify-python
143
+
144
+ # Set your Netlify token (get from https://app.netlify.com/user/applications)
145
+ export NETLIFY_TOKEN=your-token
146
+ export NETLIFY_SITE_ID=your-site-id
147
+
148
+ # Deploy a draft preview
149
+ flatfish deploy
150
+
151
+ # Deploy to production
152
+ flatfish deploy --prod
153
+
154
+ # Specify a site ID directly
155
+ flatfish deploy --prod --site your-site-id
156
+ ```
157
+
158
+ ## Output
159
+
160
+ ```
161
+ project/
162
+ ├── transcriptions/ # Extracted text files
163
+ ├── entities/ # Entity JSON files
164
+ ├── summaries/ # AI-generated summaries
165
+ └── _site/ # Built static website
166
+ ```
167
+
168
+ ## License
169
+
170
+ MIT
171
+
172
+ ## Disclosure of Delegation to Generative AI
173
+
174
+ The authors declare the use of generative AI in the research and writing process. According to the GAIDeT taxonomy (2025), the following tasks were delegated to GAI tools under full human supervision:
175
+
176
+ - Code generation
177
+ - Code optimization
178
+
179
+ The GAI tool used was: Claude Sonnet.
180
+ Responsibility for the final manuscript lies entirely with the authors.
181
+ GAI tools are not listed as authors and do not bear responsibility for the final outcomes.
182
+ Declaration submitted by: Andrew Janco
@@ -0,0 +1,136 @@
1
+ <picture>
2
+ <source media="(prefers-color-scheme: dark)" srcset="logo-dark.png">
3
+ <source media="(prefers-color-scheme: light)" srcset="logo-light.png">
4
+ <img width="100" src="logo-light.png" alt="Flatfish Logo">
5
+ </picture>
6
+
7
+ # Flatfish
8
+
9
+ Historical document analysis CLI - Extract, analyze, and present handwritten text from document images.
10
+
11
+ ## Features
12
+
13
+ - 📜 **Handwritten Text Recognition (HTR)** - Extract text from historical document images
14
+ - 🏷️ **Named Entity Recognition** - Identify people, places, dates, and more with contextual descriptions
15
+ - 📊 **AI-Powered Summaries** - Generate timelines, track changes, and suggest research questions
16
+ - 🌐 **Static Website Builder** - Create searchable, browsable document collections
17
+
18
+ ## Installation
19
+
20
+ ```bash
21
+ pip install flatfish
22
+ ```
23
+
24
+ ## Quick Start
25
+
26
+ ```bash
27
+ # Initialize a new project
28
+ flatfish init
29
+
30
+ # Edit configuration
31
+ nano flatfish.yaml
32
+ nano .env
33
+
34
+ # Validate setup
35
+ flatfish validate
36
+
37
+ # Process documents
38
+ flatfish process
39
+
40
+ # Preview the site
41
+ flatfish serve
42
+ ```
43
+
44
+ ## Configuration
45
+
46
+ ### flatfish.yaml
47
+
48
+ ```yaml
49
+ dataset:
50
+ source: "username/dataset-name"
51
+ splits:
52
+ - "train"
53
+ image_column: "image"
54
+
55
+ processing:
56
+ extract_entities: true
57
+ entity_context: true
58
+
59
+ summary:
60
+ enabled: true
61
+ model: "qwen-vl-max"
62
+
63
+ website:
64
+ title: "Document Collection"
65
+ password: "changeme"
66
+ ```
67
+
68
+ ### .env
69
+
70
+ ```bash
71
+ HUGGINGFACE_TOKEN=hf_xxxxxxxxxxxxx
72
+ DASHSCOPE_API_KEY=sk-xxxxxxxxxxxxx
73
+ ```
74
+
75
+ ## Commands
76
+
77
+ | Command | Description |
78
+ |---------|-------------|
79
+ | `flatfish init` | Initialize a new project |
80
+ | `flatfish process` | Run the full pipeline |
81
+ | `flatfish extract` | Extract text from images only |
82
+ | `flatfish entities` | Extract entities only |
83
+ | `flatfish summarize` | Generate AI summary only |
84
+ | `flatfish build` | Build static site only |
85
+ | `flatfish serve` | Preview site locally |
86
+ | `flatfish deploy` | Deploy to Netlify |
87
+ | `flatfish status` | Show processing status |
88
+ | `flatfish validate` | Validate configuration |
89
+
90
+ ## Deployment
91
+
92
+ Deploy your site to Netlify:
93
+
94
+ ```bash
95
+ # Install netlify-python
96
+ pip install netlify-python
97
+
98
+ # Set your Netlify token (get from https://app.netlify.com/user/applications)
99
+ export NETLIFY_TOKEN=your-token
100
+ export NETLIFY_SITE_ID=your-site-id
101
+
102
+ # Deploy a draft preview
103
+ flatfish deploy
104
+
105
+ # Deploy to production
106
+ flatfish deploy --prod
107
+
108
+ # Specify a site ID directly
109
+ flatfish deploy --prod --site your-site-id
110
+ ```
111
+
112
+ ## Output
113
+
114
+ ```
115
+ project/
116
+ ├── transcriptions/ # Extracted text files
117
+ ├── entities/ # Entity JSON files
118
+ ├── summaries/ # AI-generated summaries
119
+ └── _site/ # Built static website
120
+ ```
121
+
122
+ ## License
123
+
124
+ MIT
125
+
126
+ ## Disclosure of Delegation to Generative AI
127
+
128
+ The authors declare the use of generative AI in the research and writing process. According to the GAIDeT taxonomy (2025), the following tasks were delegated to GAI tools under full human supervision:
129
+
130
+ - Code generation
131
+ - Code optimization
132
+
133
+ The GAI tool used was: Claude Sonnet.
134
+ Responsibility for the final manuscript lies entirely with the authors.
135
+ GAI tools are not listed as authors and do not bear responsibility for the final outcomes.
136
+ Declaration submitted by: Andrew Janco
@@ -0,0 +1,101 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "flatfish"
7
+ version = "0.1.0"
8
+ description = "Historical document analysis CLI - Extract, analyze, and present handwritten text from document images"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ authors = [
13
+ { name = "Andrew Janco", email = "apjanco@gmail.com" }
14
+ ]
15
+ keywords = [
16
+ "ocr",
17
+ "htr",
18
+ "handwriting",
19
+ "historical-documents",
20
+ "named-entity-recognition",
21
+ "document-analysis",
22
+ ]
23
+ classifiers = [
24
+ "Development Status :: 3 - Alpha",
25
+ "Environment :: Console",
26
+ "Intended Audience :: Science/Research",
27
+ "License :: OSI Approved :: MIT License",
28
+ "Programming Language :: Python :: 3",
29
+ "Programming Language :: Python :: 3.10",
30
+ "Programming Language :: Python :: 3.11",
31
+ "Programming Language :: Python :: 3.12",
32
+ "Topic :: Scientific/Engineering :: Image Recognition",
33
+ "Topic :: Text Processing",
34
+ ]
35
+ dependencies = [
36
+ "typer>=0.12",  # rich and shellingham ship with typer itself since 0.12; the old "all" extra is no longer provided
37
+ "pyyaml>=6.0",
38
+ "pydantic>=2.0",
39
+ "pydantic-settings>=2.0",
40
+ "python-dotenv>=1.0",
41
+ "datasets>=2.0",
42
+ "Pillow>=10.0",
43
+ "openai>=1.0",
44
+ "jinja2>=3.0",
45
+ "rich>=13.0",
46
+ "httpx>=0.25",
47
+ "tqdm>=4.0",
48
+ "markdown>=3.0",
49
+ "netlify-python>=0.4.0",
50
+ "deep-translator>=1.11.0",
51
+ ]
52
+
53
+ [project.optional-dependencies]
54
+ dev = [
55
+ "pytest>=7.0",
56
+ "pytest-cov>=4.0",
57
+ "black>=23.0",
58
+ "ruff>=0.1",
59
+ "mypy>=1.0",
60
+ "pre-commit>=3.0",
61
+ ]
62
+
63
+ [project.scripts]
64
+ flatfish = "flatfish.cli:app"
65
+
66
+ [project.urls]
67
+ Homepage = "https://github.com/PULdischo/flatfish"
68
+ Documentation = "https://github.com/PULdischo/flatfish#readme"
69
+ Repository = "https://github.com/PULdischo/flatfish"
70
+ Issues = "https://github.com/PULdischo/flatfish/issues"
71
+
72
+ [tool.hatch.build.targets.wheel]
73
+ packages = ["src/flatfish"]
74
+
75
+ [tool.hatch.build.targets.sdist]
76
+ include = [
77
+ "/src",
78
+ "/tests",
79
+ ]
80
+
81
+ [tool.black]
82
+ line-length = 100
83
+ target-version = ["py310", "py311", "py312"]
84
+
85
+ [tool.ruff]
86
+ line-length = 100
87
+ target-version = "py310"
88
+
89
+ [tool.ruff.lint]
90
+ select = ["E", "F", "W", "I", "UP", "B", "C4", "SIM"]
91
+ ignore = ["E501"]
92
+
93
+ [tool.mypy]
94
+ python_version = "3.10"
95
+ warn_return_any = true
96
+ warn_unused_ignores = true
97
+ disallow_untyped_defs = true
98
+
99
+ [tool.pytest.ini_options]
100
+ testpaths = ["tests"]
101
+ pythonpath = ["src"]
@@ -0,0 +1,3 @@
1
+ """Flatfish - Historical document analysis CLI."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,6 @@
1
+ """Entry point for python -m flatfish."""
2
+
3
+ from flatfish.cli import app
4
+
5
+ if __name__ == "__main__":
6
+ app()