pdf2data-tools 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. pdf2data_tools-0.0.1/CONTRIBUTING.rst +128 -0
  2. pdf2data_tools-0.0.1/HISTORY.rst +8 -0
  3. pdf2data_tools-0.0.1/LICENSE +16 -0
  4. pdf2data_tools-0.0.1/MANIFEST.in +11 -0
  5. pdf2data_tools-0.0.1/PKG-INFO +115 -0
  6. pdf2data_tools-0.0.1/README.md +67 -0
  7. pdf2data_tools-0.0.1/docs/Makefile +20 -0
  8. pdf2data_tools-0.0.1/docs/conf.py +160 -0
  9. pdf2data_tools-0.0.1/docs/contributing.rst +1 -0
  10. pdf2data_tools-0.0.1/docs/history.rst +1 -0
  11. pdf2data_tools-0.0.1/docs/index.rst +19 -0
  12. pdf2data_tools-0.0.1/docs/installation.rst +51 -0
  13. pdf2data_tools-0.0.1/docs/make.bat +36 -0
  14. pdf2data_tools-0.0.1/docs/readme.rst +1 -0
  15. pdf2data_tools-0.0.1/docs/usage.rst +7 -0
  16. pdf2data_tools-0.0.1/pdf2data/__init__.py +5 -0
  17. pdf2data_tools-0.0.1/pdf2data/block.py +877 -0
  18. pdf2data_tools-0.0.1/pdf2data/cli/block_extractor.py +272 -0
  19. pdf2data_tools-0.0.1/pdf2data/cli/block_finder.py +50 -0
  20. pdf2data_tools-0.0.1/pdf2data/cli/evaluator.py +45 -0
  21. pdf2data_tools-0.0.1/pdf2data/cli/metadata_finder.py +67 -0
  22. pdf2data_tools-0.0.1/pdf2data/cli/pdf2data.py +360 -0
  23. pdf2data_tools-0.0.1/pdf2data/cli/reference_extractor.py +35 -0
  24. pdf2data_tools-0.0.1/pdf2data/cli/table_detector.py +81 -0
  25. pdf2data_tools-0.0.1/pdf2data/cli/text_extractor.py +118 -0
  26. pdf2data_tools-0.0.1/pdf2data/cli/text_finder.py +70 -0
  27. pdf2data_tools-0.0.1/pdf2data/docling.py +266 -0
  28. pdf2data_tools-0.0.1/pdf2data/evaluator.py +345 -0
  29. pdf2data_tools-0.0.1/pdf2data/keywords.py +202 -0
  30. pdf2data_tools-0.0.1/pdf2data/mask.py +690 -0
  31. pdf2data_tools-0.0.1/pdf2data/metadata.py +127 -0
  32. pdf2data_tools-0.0.1/pdf2data/mineru.py +356 -0
  33. pdf2data_tools-0.0.1/pdf2data/mineru_vlm.py +236 -0
  34. pdf2data_tools-0.0.1/pdf2data/padle_pipeline.py +417 -0
  35. pdf2data_tools-0.0.1/pdf2data/pdf2data_pipeline.py +51 -0
  36. pdf2data_tools-0.0.1/pdf2data/pdf_classifier.py +46 -0
  37. pdf2data_tools-0.0.1/pdf2data/pipeline.py +231 -0
  38. pdf2data_tools-0.0.1/pdf2data/references.py +49 -0
  39. pdf2data_tools-0.0.1/pdf2data/support.py +1477 -0
  40. pdf2data_tools-0.0.1/pdf2data/text.py +315 -0
  41. pdf2data_tools-0.0.1/pdf2data_tools.egg-info/PKG-INFO +115 -0
  42. pdf2data_tools-0.0.1/pdf2data_tools.egg-info/SOURCES.txt +54 -0
  43. pdf2data_tools-0.0.1/pdf2data_tools.egg-info/dependency_links.txt +1 -0
  44. pdf2data_tools-0.0.1/pdf2data_tools.egg-info/entry_points.txt +10 -0
  45. pdf2data_tools-0.0.1/pdf2data_tools.egg-info/requires.txt +22 -0
  46. pdf2data_tools-0.0.1/pdf2data_tools.egg-info/top_level.txt +1 -0
  47. pdf2data_tools-0.0.1/pyproject.toml +76 -0
  48. pdf2data_tools-0.0.1/setup.cfg +26 -0
  49. pdf2data_tools-0.0.1/setup.py +7 -0
  50. pdf2data_tools-0.0.1/tests/__init__.py +1 -0
  51. pdf2data_tools-0.0.1/tests/block_test.py +757 -0
  52. pdf2data_tools-0.0.1/tests/mask_test.py +36 -0
  53. pdf2data_tools-0.0.1/tests/metadata_test.py +23 -0
  54. pdf2data_tools-0.0.1/tests/pdf_classifier_test.py +21 -0
  55. pdf2data_tools-0.0.1/tests/text_test.py +34 -0
@@ -0,0 +1,128 @@
1
+ .. highlight:: shell
2
+
3
+ ============
4
+ Contributing
5
+ ============
6
+
7
+ Contributions are welcome, and they are greatly appreciated! Every little bit
8
+ helps, and credit will always be given.
9
+
10
+ You can contribute in many ways:
11
+
12
+ Types of Contributions
13
+ ----------------------
14
+
15
+ Report Bugs
16
+ ~~~~~~~~~~~
17
+
18
+ Report bugs at https://github.com/pocoyo7798/pdf2data/issues.
19
+
20
+ If you are reporting a bug, please include:
21
+
22
+ * Your operating system name and version.
23
+ * Any details about your local setup that might be helpful in troubleshooting.
24
+ * Detailed steps to reproduce the bug.
25
+
26
+ Fix Bugs
27
+ ~~~~~~~~
28
+
29
+ Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
30
+ wanted" is open to whoever wants to implement it.
31
+
32
+ Implement Features
33
+ ~~~~~~~~~~~~~~~~~~
34
+
35
+ Look through the GitHub issues for features. Anything tagged with "enhancement"
36
+ and "help wanted" is open to whoever wants to implement it.
37
+
38
+ Write Documentation
39
+ ~~~~~~~~~~~~~~~~~~~
40
+
41
+ pdf2data could always use more documentation, whether as part of the
42
+ official pdf2data docs, in docstrings, or even on the web in blog posts,
43
+ articles, and such.
44
+
45
+ Submit Feedback
46
+ ~~~~~~~~~~~~~~~
47
+
48
+ The best way to send feedback is to file an issue at https://github.com/pocoyo7798/pdf2data/issues.
49
+
50
+ If you are proposing a feature:
51
+
52
+ * Explain in detail how it would work.
53
+ * Keep the scope as narrow as possible, to make it easier to implement.
54
+ * Remember that this is a volunteer-driven project, and that contributions
55
+ are welcome :)
56
+
57
+ Get Started!
58
+ ------------
59
+
60
+ Ready to contribute? Here's how to set up `pdf2data` for local development.
61
+
62
+ 1. Fork the `pdf2data` repo on GitHub.
63
+ 2. Clone your fork locally::
64
+
65
+ $ git clone git@github.com:your_name_here/pdf2data.git
66
+
67
+ 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
68
+
69
+ $ mkvirtualenv pdf2data
70
+ $ cd pdf2data/
71
+ $ pip install -e .
72
+
73
+ 4. Create a branch for local development::
74
+
75
+ $ git checkout -b name-of-your-bugfix-or-feature
76
+
77
+ Now you can make your changes locally.
78
+
79
+ 5. When you're done making changes, check that your changes pass flake8 and the
80
+ tests, including testing other Python versions with tox::
81
+
82
+ $ flake8 pdf2data tests
83
+ $ pytest
84
+ $ tox
85
+
86
+ To get flake8 and tox, just pip install them into your virtualenv.
87
+
88
+ 6. Commit your changes and push your branch to GitHub::
89
+
90
+ $ git add .
91
+ $ git commit -m "Your detailed description of your changes."
92
+ $ git push origin name-of-your-bugfix-or-feature
93
+
94
+ 7. Submit a pull request through the GitHub website.
95
+
96
+ Pull Request Guidelines
97
+ -----------------------
98
+
99
+ Before you submit a pull request, check that it meets these guidelines:
100
+
101
+ 1. The pull request should include tests.
102
+ 2. If the pull request adds functionality, the docs should be updated. Put
103
+ your new functionality into a function with a docstring, and add the
104
+ feature to the list in README.md.
105
+ 3. The pull request should work for Python 3.8, 3.9, 3.10, 3.11 and 3.12, and for PyPy. Check
106
+ https://travis-ci.com/pocoyo7798/pdf2data/pull_requests
107
+ and make sure that the tests pass for all supported Python versions.
108
+
109
+ Tips
110
+ ----
111
+
112
+ To run a subset of tests::
113
+
114
+ $ pytest tests/block_test.py
115
+
116
+
117
+ Deploying
118
+ ---------
119
+
120
+ A reminder for the maintainers on how to deploy.
121
+ Make sure all your changes are committed (including an entry in HISTORY.rst).
122
+ Then run::
123
+
124
+ $ bump2version patch # possible: major / minor / patch
125
+ $ git push
126
+ $ git push --tags
127
+
128
+ Travis will then deploy to PyPI if tests pass.
@@ -0,0 +1,8 @@
1
+ =======
2
+ History
3
+ =======
4
+
5
+ 0.0.1 (2024-01-11)
6
+ ------------------
7
+
8
+ * First release on PyPI.
@@ -0,0 +1,16 @@
1
+ Apache Software License 2.0
2
+
3
+ Copyright (c) 2024, Daniel Pereira Costa
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+
@@ -0,0 +1,11 @@
1
+ include CONTRIBUTING.rst
2
+ include HISTORY.rst
3
+ include LICENSE
4
+ include README.md
5
+ include pyproject.toml
6
+
7
+ recursive-include tests *
8
+ recursive-exclude * __pycache__
9
+ recursive-exclude * *.py[co]
10
+
11
+ recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
@@ -0,0 +1,115 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdf2data-tools
3
+ Version: 0.0.1
4
+ Summary: Transforms PDF files into machine readable JSON files
5
+ Author-email: Daniel Pereira Costa <daniel.pereira.costa@tecnico.ulisboa.pt>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/pocoyo7798/pdf2data
8
+ Project-URL: Repository, https://github.com/pocoyo7798/pdf2data
9
+ Project-URL: Issues, https://github.com/pocoyo7798/pdf2data/issues
10
+ Keywords: pdf,data-extraction,json,tables,figures,document-analysis
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Natural Language :: English
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.8
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering
23
+ Classifier: Topic :: Text Processing
24
+ Requires-Python: >=3.8
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: Click<=8.3.1
28
+ Requires-Dist: PyMuPDF<=1.26.7
29
+ Requires-Dist: pylatexenc<=2.10
30
+ Requires-Dist: pydantic<=2.12.5
31
+ Requires-Dist: beautifulsoup4<=4.14.3
32
+ Requires-Dist: pdf2doi<=1.7
33
+ Requires-Dist: Levenshtein<=0.27.3
34
+ Requires-Dist: trieregex<=1.0.0
35
+ Requires-Dist: bibtexparser<=1.4.3
36
+ Requires-Dist: pypdf>=3.1.0
37
+ Provides-Extra: test
38
+ Requires-Dist: pytest>=3; extra == "test"
39
+ Provides-Extra: pdf2data-pipeline
40
+ Requires-Dist: torch<=2.10.0; extra == "pdf2data-pipeline"
41
+ Requires-Dist: opencv-python<=4.13.0.92; extra == "pdf2data-pipeline"
42
+ Requires-Dist: tensorflow<=2.20.0; extra == "pdf2data-pipeline"
43
+ Requires-Dist: doclayout_yolo<=0.0.4; extra == "pdf2data-pipeline"
44
+ Requires-Dist: pdf2image<=1.17.0; extra == "pdf2data-pipeline"
45
+ Requires-Dist: paddleocr<=3.4.0; extra == "pdf2data-pipeline"
46
+ Requires-Dist: paddlepaddle<=3.3.0; extra == "pdf2data-pipeline"
47
+ Dynamic: license-file
48
+
49
+ # pdf2data
50
+
51
+ [![PyPI version](https://badge.fury.io/py/pdf2data-tools.svg)](https://pypi.org/project/pdf2data-tools/)
52
+ [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
53
+
54
+ Transforms PDF files into machine-readable JSON files. Extracts tables, figures, text blocks, metadata, and references from scientific papers and documents.
55
+
56
+ > **Note:** The repository is under active development for an article publication. Some errors are expected. Please report any issues on the [issues page](https://github.com/Pocoyo7798/pdf2data/issues).
57
+
58
+ ## Installation
59
+
60
+ ### From PyPI (recommended)
61
+
62
+ ```bash
63
+ pip install pdf2data-tools
64
+ ```
65
+
66
+ ### With optional dependencies
67
+
68
+ ```bash
69
+ # For the full PDF2Data pipeline (layout detection, OCR, etc.)
70
+ pip install pdf2data-tools[pdf2data_pipeline]
71
+ ```
72
+
73
+ ### From source (development)
74
+
75
+ ```bash
76
+ conda create --name pdf2data python=3.10
77
+ conda activate pdf2data
78
+ git clone git@github.com:Pocoyo7798/pdf2data.git
79
+ cd pdf2data
80
+ pip install -e .
81
+ ```
82
+
83
+ ## Usage
84
+
85
+ ### As a library
86
+
87
+ ```python
88
+ from pdf2data.pdf2data_pipeline import PDF2Data
89
+
90
+ pipeline = PDF2Data(
91
+ layout_model="DocLayout-YOLO-DocStructBench",
92
+ input_folder="path/to/pdfs",
93
+ output_folder="path/to/results",
94
+ )
95
+ ```
96
+
97
+ ### Command line
98
+
99
+ ```bash
100
+ # Extract tables and figures
101
+ pdf2data_block path_to_folder path_to_results
102
+
103
+ # Extract text
104
+ pdf2data_text path_to_folder path_to_results
105
+
106
+ # Extract metadata
107
+ pdf2data_metadata path_to_folder path_to_results
108
+
109
+ # Extract references
110
+ pdf2data_references path_to_folder path_to_results
111
+ ```
112
+
113
+ ## License
114
+
115
+ Apache Software License 2.0
@@ -0,0 +1,67 @@
1
+ # pdf2data
2
+
3
+ [![PyPI version](https://badge.fury.io/py/pdf2data-tools.svg)](https://pypi.org/project/pdf2data-tools/)
4
+ [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
5
+
6
+ Transforms PDF files into machine-readable JSON files. Extracts tables, figures, text blocks, metadata, and references from scientific papers and documents.
7
+
8
+ > **Note:** The repository is under active development for an article publication. Some errors are expected. Please report any issues on the [issues page](https://github.com/Pocoyo7798/pdf2data/issues).
9
+
10
+ ## Installation
11
+
12
+ ### From PyPI (recommended)
13
+
14
+ ```bash
15
+ pip install pdf2data-tools
16
+ ```
17
+
18
+ ### With optional dependencies
19
+
20
+ ```bash
21
+ # For the full PDF2Data pipeline (layout detection, OCR, etc.)
22
+ pip install pdf2data-tools[pdf2data_pipeline]
23
+ ```
24
+
25
+ ### From source (development)
26
+
27
+ ```bash
28
+ conda create --name pdf2data python=3.10
29
+ conda activate pdf2data
30
+ git clone git@github.com:Pocoyo7798/pdf2data.git
31
+ cd pdf2data
32
+ pip install -e .
33
+ ```
34
+
35
+ ## Usage
36
+
37
+ ### As a library
38
+
39
+ ```python
40
+ from pdf2data.pdf2data_pipeline import PDF2Data
41
+
42
+ pipeline = PDF2Data(
43
+ layout_model="DocLayout-YOLO-DocStructBench",
44
+ input_folder="path/to/pdfs",
45
+ output_folder="path/to/results",
46
+ )
47
+ ```
48
+
49
+ ### Command line
50
+
51
+ ```bash
52
+ # Extract tables and figures
53
+ pdf2data_block path_to_folder path_to_results
54
+
55
+ # Extract text
56
+ pdf2data_text path_to_folder path_to_results
57
+
58
+ # Extract metadata
59
+ pdf2data_metadata path_to_folder path_to_results
60
+
61
+ # Extract references
62
+ pdf2data_references path_to_folder path_to_results
63
+ ```
64
+
65
+ ## License
66
+
67
+ Apache Software License 2.0
@@ -0,0 +1,20 @@
1
+ # Minimal makefile for Sphinx documentation
2
+ #
3
+
4
+ # You can set these variables from the command line.
5
+ SPHINXOPTS =
6
+ SPHINXBUILD = python -msphinx
7
+ SPHINXPROJ = pdf2data
8
+ SOURCEDIR = .
9
+ BUILDDIR = _build
10
+
11
+ # Put it first so that "make" without argument is like "make help".
12
+ help:
13
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14
+
15
+ .PHONY: help Makefile
16
+
17
+ # Catch-all target: route all unknown targets to Sphinx using the new
18
+ # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19
+ %: Makefile
20
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -0,0 +1,160 @@
1
+ #!/usr/bin/env python
2
+ #
3
+ # pdf2data documentation build configuration file, created by
4
+ # sphinx-quickstart on Fri Jun 9 13:47:02 2017.
5
+ #
6
+ # This file is execfile()d with the current directory set to its
7
+ # containing dir.
8
+ #
9
+ # Note that not all possible configuration values are present in this
10
+ # autogenerated file.
11
+ #
12
+ # All configuration values have a default; values that are commented out
13
+ # serve to show the default.
14
+
15
+ # If extensions (or modules to document with autodoc) are in another
16
+ # directory, add these directories to sys.path here. If the directory is
17
+ # relative to the documentation root, use os.path.abspath to make it
18
+ # absolute, like shown here.
19
+ #
20
+ import os
21
+ import sys
22
+
23
+ sys.path.insert(0, os.path.abspath(".."))
24
+
25
+ import pdf2data
26
+
27
+ # -- General configuration ---------------------------------------------
28
+
29
+ # If your documentation needs a minimal Sphinx version, state it here.
30
+ #
31
+ # needs_sphinx = '1.0'
32
+
33
+ # Add any Sphinx extension module names here, as strings. They can be
34
+ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
35
+ extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode"]
36
+
37
+ # Add any paths that contain templates here, relative to this directory.
38
+ templates_path = ["_templates"]
39
+
40
+ # The suffix(es) of source filenames.
41
+ # You can specify multiple suffix as a list of string:
42
+ #
43
+ # source_suffix = ['.rst', '.md']
44
+ source_suffix = ".rst"
45
+
46
+ # The master toctree document.
47
+ master_doc = "index"
48
+
49
+ # General information about the project.
50
+ project = "pdf2data"
51
+ copyright = "2024, Daniel Pereira Costa"
52
+ author = "Daniel Pereira Costa"
53
+
54
+ # The version info for the project you're documenting, acts as replacement
55
+ # for |version| and |release|, also used in various other places throughout
56
+ # the built documents.
57
+ #
58
+ # The short X.Y version.
59
+ version = pdf2data.__version__
60
+ # The full version, including alpha/beta/rc tags.
61
+ release = pdf2data.__version__
62
+
63
+ # The language for content autogenerated by Sphinx. Refer to documentation
64
+ # for a list of supported languages.
65
+ #
66
+ # This is also used if you do content translation via gettext catalogs.
67
+ # Usually you set "language" from the command line for these cases.
68
+ language = None
69
+
70
+ # List of patterns, relative to source directory, that match files and
71
+ # directories to ignore when looking for source files.
72
+ # These patterns also affect html_static_path and html_extra_path.
73
+ exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
74
+
75
+ # The name of the Pygments (syntax highlighting) style to use.
76
+ pygments_style = "sphinx"
77
+
78
+ # If true, `todo` and `todoList` produce output, else they produce nothing.
79
+ todo_include_todos = False
80
+
81
+
82
+ # -- Options for HTML output -------------------------------------------
83
+
84
+ # The theme to use for HTML and HTML Help pages. See the documentation for
85
+ # a list of builtin themes.
86
+ #
87
+ html_theme = "alabaster"
88
+
89
+ # Theme options are theme-specific and customize the look and feel of a
90
+ # theme further. For a list of options available for each theme, see the
91
+ # documentation.
92
+ #
93
+ # html_theme_options = {}
94
+
95
+ # Add any paths that contain custom static files (such as style sheets) here,
96
+ # relative to this directory. They are copied after the builtin static files,
97
+ # so a file named "default.css" will overwrite the builtin "default.css".
98
+ html_static_path = ["_static"]
99
+
100
+
101
+ # -- Options for HTMLHelp output ---------------------------------------
102
+
103
+ # Output file base name for HTML help builder.
104
+ htmlhelp_basename = "pdf2datadoc"
105
+
106
+
107
+ # -- Options for LaTeX output ------------------------------------------
108
+
109
+ latex_elements = {
110
+ # The paper size ('letterpaper' or 'a4paper').
111
+ #
112
+ # 'papersize': 'letterpaper',
113
+ # The font size ('10pt', '11pt' or '12pt').
114
+ #
115
+ # 'pointsize': '10pt',
116
+ # Additional stuff for the LaTeX preamble.
117
+ #
118
+ # 'preamble': '',
119
+ # Latex figure (float) alignment
120
+ #
121
+ # 'figure_align': 'htbp',
122
+ }
123
+
124
+ # Grouping the document tree into LaTeX files. List of tuples
125
+ # (source start file, target name, title, author, documentclass
126
+ # [howto, manual, or own class]).
127
+ latex_documents = [
128
+ (
129
+ master_doc,
130
+ "pdf2data.tex",
131
+ "pdf2data Documentation",
132
+ "Daniel Pereira Costa",
133
+ "manual",
134
+ ),
135
+ ]
136
+
137
+
138
+ # -- Options for manual page output ------------------------------------
139
+
140
+ # One entry per manual page. List of tuples
141
+ # (source start file, name, description, authors, manual section).
142
+ man_pages = [(master_doc, "pdf2data", "pdf2data Documentation", [author], 1)]
143
+
144
+
145
+ # -- Options for Texinfo output ----------------------------------------
146
+
147
+ # Grouping the document tree into Texinfo files. List of tuples
148
+ # (source start file, target name, title, author,
149
+ # dir menu entry, description, category)
150
+ texinfo_documents = [
151
+ (
152
+ master_doc,
153
+ "pdf2data",
154
+ "pdf2data Documentation",
155
+ author,
156
+ "pdf2data",
157
+ "One line description of project.",
158
+ "Miscellaneous",
159
+ ),
160
+ ]
@@ -0,0 +1 @@
1
+ .. include:: ../CONTRIBUTING.rst
@@ -0,0 +1 @@
1
+ .. include:: ../HISTORY.rst
@@ -0,0 +1,19 @@
1
+ Welcome to pdf2data's documentation!
2
+ ======================================
3
+
4
+ .. toctree::
5
+ :maxdepth: 2
6
+ :caption: Contents:
7
+
8
+ readme
9
+ installation
10
+ usage
11
+ modules
12
+ contributing
13
+ history
14
+
15
+ Indices and tables
16
+ ==================
17
+ * :ref:`genindex`
18
+ * :ref:`modindex`
19
+ * :ref:`search`
@@ -0,0 +1,51 @@
1
+ .. highlight:: shell
2
+
3
+ ============
4
+ Installation
5
+ ============
6
+
7
+
8
+ Stable release
9
+ --------------
10
+
11
+ To install pdf2data, run this command in your terminal:
12
+
13
+ .. code-block:: console
14
+
15
+ $ pip install pdf2data-tools
16
+
17
+ This is the preferred method to install pdf2data, as it will always install the most recent stable release.
18
+
19
+ If you don't have `pip`_ installed, this `Python installation guide`_ can guide
20
+ you through the process.
21
+
22
+ .. _pip: https://pip.pypa.io
23
+ .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
24
+
25
+
26
+ From sources
27
+ ------------
28
+
29
+ The sources for pdf2data can be downloaded from the `Github repo`_.
30
+
31
+ You can either clone the public repository:
32
+
33
+ .. code-block:: console
34
+
35
+ $ git clone https://github.com/pocoyo7798/pdf2data
36
+
37
+ Or download the `tarball`_:
38
+
39
+ .. code-block:: console
40
+
41
+ $ curl -OJL https://github.com/pocoyo7798/pdf2data/tarball/master
42
+
43
+ Once you have a copy of the source, you can install it with:
44
+
45
+ .. code-block:: console
46
+
47
+ $ pip install .
48
+
49
+
50
+ .. _Github repo: https://github.com/pocoyo7798/pdf2data
51
+ .. _tarball: https://github.com/pocoyo7798/pdf2data/tarball/master
@@ -0,0 +1,36 @@
1
+ @ECHO OFF
2
+
3
+ pushd %~dp0
4
+
5
+ REM Command file for Sphinx documentation
6
+
7
+ if "%SPHINXBUILD%" == "" (
8
+ set SPHINXBUILD=python -msphinx
9
+ )
10
+ set SOURCEDIR=.
11
+ set BUILDDIR=_build
12
+ set SPHINXPROJ=pdf2data
13
+
14
+ if "%1" == "" goto help
15
+
16
+ %SPHINXBUILD% >NUL 2>NUL
17
+ if errorlevel 9009 (
18
+ echo.
19
+ echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20
+ echo.then set the SPHINXBUILD environment variable to point to the full
21
+ echo.path of the 'sphinx-build' executable. Alternatively you may add the
22
+ echo.Sphinx directory to PATH.
23
+ echo.
24
+ echo.If you don't have Sphinx installed, grab it from
25
+ echo.http://sphinx-doc.org/
26
+ exit /b 1
27
+ )
28
+
29
+ %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30
+ goto end
31
+
32
+ :help
33
+ %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34
+
35
+ :end
36
+ popd
@@ -0,0 +1 @@
1
+ .. include:: ../README.md
@@ -0,0 +1,7 @@
1
+ =====
2
+ Usage
3
+ =====
4
+
5
+ To use pdf2data in a project::
6
+
7
+ import pdf2data
@@ -0,0 +1,5 @@
1
+ """Top-level package for pdf2data."""
2
+
3
+ __author__ = """Daniel Pereira Costa"""
4
+ __email__ = "daniel.pereira.costa@tecnico.ulisboa.pt"
5
+ __version__ = "0.0.1"