inscriptis 2.6.0__tar.gz → 2.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. inscriptis-2.7.0/.coveragerc +2 -0
  2. inscriptis-2.7.0/.git-blame-ignore-revs +2 -0
  3. inscriptis-2.7.0/.github/dependabot.yml +18 -0
  4. inscriptis-2.7.0/.github/workflows/codeql-analysis.yml +68 -0
  5. inscriptis-2.7.0/.github/workflows/create-container.yml +35 -0
  6. inscriptis-2.7.0/.github/workflows/helm-release.yaml +20 -0
  7. inscriptis-2.7.0/.github/workflows/python-package.yml +29 -0
  8. inscriptis-2.7.0/.gitignore +30 -0
  9. inscriptis-2.7.0/.readthedocs.yaml +35 -0
  10. inscriptis-2.7.0/.safety-project.ini +5 -0
  11. inscriptis-2.7.0/CONTRIBUTING.md +54 -0
  12. inscriptis-2.7.0/Dockerfile +24 -0
  13. {inscriptis-2.6.0 → inscriptis-2.7.0}/PKG-INFO +116 -99
  14. {inscriptis-2.6.0 → inscriptis-2.7.0}/README.rst +101 -83
  15. inscriptis-2.7.0/RENDERING.md +244 -0
  16. inscriptis-2.7.0/TODO.txt +5 -0
  17. inscriptis-2.7.0/benchmarking/a +113 -0
  18. inscriptis-2.7.0/benchmarking/b +3 -0
  19. inscriptis-2.7.0/benchmarking/run_benchmarking.py +369 -0
  20. inscriptis-2.7.0/benchmarking/speed_comparisons.txt +113 -0
  21. inscriptis-2.7.0/benchmarking/url_list.txt +18 -0
  22. inscriptis-2.7.0/docker-compose.yml +14 -0
  23. inscriptis-2.7.0/docs/Makefile +20 -0
  24. inscriptis-2.7.0/docs/README.rst +1 -0
  25. inscriptis-2.7.0/docs/api.rst +76 -0
  26. inscriptis-2.7.0/docs/benchmarking.rst +57 -0
  27. inscriptis-2.7.0/docs/conf.py +201 -0
  28. inscriptis-2.7.0/docs/contributing.md +1 -0
  29. inscriptis-2.7.0/docs/images/stackoverflow-code-annotation.png +0 -0
  30. inscriptis-2.7.0/docs/images/wikipedia-chur-entry-annotation.png +0 -0
  31. inscriptis-2.7.0/docs/images/wikipedia-chur-table-annotation.png +0 -0
  32. inscriptis-2.7.0/docs/images/xda-posts-annotation.png +0 -0
  33. inscriptis-2.7.0/docs/index.rst +29 -0
  34. inscriptis-2.7.0/docs/paper/Makefile +6 -0
  35. inscriptis-2.7.0/docs/paper/images/annotations.png +0 -0
  36. inscriptis-2.7.0/docs/paper/images/inscriptis-vs-lynx.png +0 -0
  37. inscriptis-2.7.0/docs/paper/images/inscriptis-vs-lynx.xcf +0 -0
  38. inscriptis-2.7.0/docs/paper/images/raw/inscriptis.png +0 -0
  39. inscriptis-2.7.0/docs/paper/images/raw/lynx.png +0 -0
  40. inscriptis-2.7.0/docs/paper/paper.bib +515 -0
  41. inscriptis-2.7.0/docs/paper/paper.md +82 -0
  42. inscriptis-2.7.0/docs/requirements.txt +4 -0
  43. inscriptis-2.7.0/examples/annotation/annotation-profile.json +14 -0
  44. inscriptis-2.7.0/examples/annotation/stackoverflow.json +14 -0
  45. inscriptis-2.7.0/examples/annotation/table-annotation-profile.json +7 -0
  46. inscriptis-2.7.0/examples/annotation/unittest.json +7 -0
  47. inscriptis-2.7.0/examples/annotation/wikipedia-entities-and-citations.json +5 -0
  48. inscriptis-2.7.0/examples/annotation/wikipedia.json +12 -0
  49. inscriptis-2.7.0/examples/annotation/xda-developers.json +6 -0
  50. inscriptis-2.7.0/examples/custom-html-handling.py +41 -0
  51. inscriptis-2.7.0/img/nested-table-firefox.png +0 -0
  52. inscriptis-2.7.0/img/wikipedia-chur-firefox.png +0 -0
  53. inscriptis-2.7.0/img/wikipedia-python-example.png +0 -0
  54. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/__init__.py +16 -11
  55. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/annotation/__init__.py +4 -6
  56. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/annotation/output/__init__.py +3 -2
  57. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/annotation/output/html.py +16 -16
  58. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/annotation/output/surface.py +6 -8
  59. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/annotation/output/xml.py +2 -2
  60. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/annotation/parser.py +9 -6
  61. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/cli/inscript.py +25 -46
  62. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/css_profiles.py +8 -14
  63. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/html_engine.py +28 -26
  64. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/metadata.py +2 -3
  65. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/attribute.py +7 -11
  66. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/canvas/__init__.py +8 -6
  67. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/canvas/block.py +8 -7
  68. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/canvas/prefix.py +7 -7
  69. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/config.py +52 -8
  70. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/css.py +7 -6
  71. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/html_document_state.py +10 -5
  72. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/html_element.py +39 -31
  73. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/table.py +33 -50
  74. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/tag/__init__.py +7 -4
  75. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/tag/a_tag.py +1 -2
  76. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/tag/br_tag.py +1 -2
  77. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/tag/img_tag.py +2 -5
  78. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/tag/list_tag.py +8 -4
  79. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/tag/table_tag.py +5 -8
  80. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/service/web.py +6 -10
  81. inscriptis-2.7.0/publish.sh +38 -0
  82. inscriptis-2.7.0/pyproject.toml +137 -0
  83. inscriptis-2.7.0/tests/__init__.py +0 -0
  84. inscriptis-2.7.0/tests/data/annotation-profile-unittest.json +7 -0
  85. inscriptis-2.7.0/tests/html/advanced-prefix-test.html +19 -0
  86. inscriptis-2.7.0/tests/html/advanced-prefix-test.txt +13 -0
  87. inscriptis-2.7.0/tests/html/br-in-table.html +10 -0
  88. inscriptis-2.7.0/tests/html/br-in-table.txt +3 -0
  89. inscriptis-2.7.0/tests/html/br-in-table2.html +22 -0
  90. inscriptis-2.7.0/tests/html/br-li.html +6 -0
  91. inscriptis-2.7.0/tests/html/br-li.txt +4 -0
  92. inscriptis-2.7.0/tests/html/br.html +2 -0
  93. inscriptis-2.7.0/tests/html/br.txt +2 -0
  94. inscriptis-2.7.0/tests/html/direct-enumeration.html +13 -0
  95. inscriptis-2.7.0/tests/html/direct-enumeration.txt +7 -0
  96. inscriptis-2.7.0/tests/html/empty-table.html +3 -0
  97. inscriptis-2.7.0/tests/html/empty-table.txt +1 -0
  98. inscriptis-2.7.0/tests/html/enumeration-multiple-value.html +8 -0
  99. inscriptis-2.7.0/tests/html/enumeration-multiple-value.txt +6 -0
  100. inscriptis-2.7.0/tests/html/enumeration-value.html +8 -0
  101. inscriptis-2.7.0/tests/html/enumeration-value.txt +6 -0
  102. inscriptis-2.7.0/tests/html/enumerations.html +14 -0
  103. inscriptis-2.7.0/tests/html/enumerations.txt +8 -0
  104. inscriptis-2.7.0/tests/html/html-comment-ofuscation.html +1 -0
  105. inscriptis-2.7.0/tests/html/html-comment-ofuscation.txt +1 -0
  106. inscriptis-2.7.0/tests/html/invalid-table.html +14 -0
  107. inscriptis-2.7.0/tests/html/invalid-table.txt +3 -0
  108. inscriptis-2.7.0/tests/html/invalid-table2.html +10 -0
  109. inscriptis-2.7.0/tests/html/invalid-table2.txt +4 -0
  110. inscriptis-2.7.0/tests/html/invalid-table3.html +10 -0
  111. inscriptis-2.7.0/tests/html/invalid-table3.txt +4 -0
  112. inscriptis-2.7.0/tests/html/invisible.html +4 -0
  113. inscriptis-2.7.0/tests/html/invisible.txt +1 -0
  114. inscriptis-2.7.0/tests/html/invisible2.html +2 -0
  115. inscriptis-2.7.0/tests/html/invisible2.txt +1 -0
  116. inscriptis-2.7.0/tests/html/invisible3.html +2 -0
  117. inscriptis-2.7.0/tests/html/invisible3.txt +0 -0
  118. inscriptis-2.7.0/tests/html/nested-list.html +36 -0
  119. inscriptis-2.7.0/tests/html/nested-list.txt +21 -0
  120. inscriptis-2.7.0/tests/html/nested-table-alignment-css.html +23 -0
  121. inscriptis-2.7.0/tests/html/nested-table-alignment-css.txt +7 -0
  122. inscriptis-2.7.0/tests/html/nested-table-alignment.html +23 -0
  123. inscriptis-2.7.0/tests/html/nested-table-alignment.txt +7 -0
  124. inscriptis-2.7.0/tests/html/nested-table.html +19 -0
  125. inscriptis-2.7.0/tests/html/nested-table.txt +5 -0
  126. inscriptis-2.7.0/tests/html/p-br.html +9 -0
  127. inscriptis-2.7.0/tests/html/p-br.txt +11 -0
  128. inscriptis-2.7.0/tests/html/pre.html +22 -0
  129. inscriptis-2.7.0/tests/html/pre.txt +20 -0
  130. inscriptis-2.7.0/tests/html/real-world/avantec-team.html +903 -0
  131. inscriptis-2.7.0/tests/html/real-world/naturgruen-team.html +177 -0
  132. inscriptis-2.7.0/tests/html/real-world/rswag-mitarbeiter.html +811 -0
  133. inscriptis-2.7.0/tests/html/stackoverflow-list-snippet.html +34 -0
  134. inscriptis-2.7.0/tests/html/stackoverflow-list-snippet.txt +2 -0
  135. inscriptis-2.7.0/tests/html/subsequent-headings.html +24 -0
  136. inscriptis-2.7.0/tests/html/subsequent-headings.json +18 -0
  137. inscriptis-2.7.0/tests/html/subsequent-headings.txt +21 -0
  138. inscriptis-2.7.0/tests/html/table-alignment.html +8 -0
  139. inscriptis-2.7.0/tests/html/table-alignment.txt +4 -0
  140. inscriptis-2.7.0/tests/html/table-empty-row.html +9 -0
  141. inscriptis-2.7.0/tests/html/table-empty-row.txt +5 -0
  142. inscriptis-2.7.0/tests/html/table-in-table.html +48 -0
  143. inscriptis-2.7.0/tests/html/table-in-table.json +25 -0
  144. inscriptis-2.7.0/tests/html/table-in-table.txt +29 -0
  145. inscriptis-2.7.0/tests/html/table-itemize.html +7 -0
  146. inscriptis-2.7.0/tests/html/table-itemize.txt +4 -0
  147. inscriptis-2.7.0/tests/html/table-pre.html +36 -0
  148. inscriptis-2.7.0/tests/html/table-pre.txt +12 -0
  149. inscriptis-2.7.0/tests/html/table.html +10 -0
  150. inscriptis-2.7.0/tests/html/table.json +20 -0
  151. inscriptis-2.7.0/tests/html/table.txt +2 -0
  152. inscriptis-2.7.0/tests/html/td-only-table.html +5 -0
  153. inscriptis-2.7.0/tests/html/td-only-table.txt +1 -0
  154. inscriptis-2.7.0/tests/html/test.html +123 -0
  155. inscriptis-2.7.0/tests/html/tr-only-table.html +5 -0
  156. inscriptis-2.7.0/tests/html/tr-only-table.txt +3 -0
  157. inscriptis-2.7.0/tests/html/whitespace.html +9 -0
  158. inscriptis-2.7.0/tests/html/whitespace.txt +4 -0
  159. inscriptis-2.7.0/tests/html/wikipedia-code.html +14 -0
  160. inscriptis-2.7.0/tests/html/wikipedia-code.txt +16 -0
  161. inscriptis-2.7.0/tests/html/wikipedia-consequtive-links-and-umlauts.html +16 -0
  162. inscriptis-2.7.0/tests/html/wikipedia-consequtive-links-and-umlauts.txt +1 -0
  163. inscriptis-2.7.0/tests/html/wikipedia-consequtive-tables.html +243 -0
  164. inscriptis-2.7.0/tests/html/wikipedia-consequtive-tables.json +32 -0
  165. inscriptis-2.7.0/tests/html/wikipedia-enumeration-annotation.html +66 -0
  166. inscriptis-2.7.0/tests/html/wikipedia-enumeration-annotation.json +19 -0
  167. inscriptis-2.7.0/tests/html/wikipedia-enumeration-annotation.txt +45 -0
  168. inscriptis-2.7.0/tests/html/wikipedia-enumeration.html +61 -0
  169. inscriptis-2.7.0/tests/html/wikipedia-enumeration.txt +39 -0
  170. inscriptis-2.7.0/tests/html/wikipedia-equation.html +10 -0
  171. inscriptis-2.7.0/tests/html/wikipedia-equation.txt +7 -0
  172. inscriptis-2.7.0/tests/html/wikipedia-table-bordercase-verticial-alignmnet.html +28 -0
  173. inscriptis-2.7.0/tests/html/wikipedia-table-bordercase-verticial-alignmnet.json +31 -0
  174. inscriptis-2.7.0/tests/html/wikipedia-table-bordercase1.html +21 -0
  175. inscriptis-2.7.0/tests/html/wikipedia-table-bordercase1.json +21 -0
  176. inscriptis-2.7.0/tests/html/wikipedia-table.html +33 -0
  177. inscriptis-2.7.0/tests/html/wikipedia-table.json +33 -0
  178. inscriptis-2.7.0/tests/html/wikipedia-table.txt +9 -0
  179. inscriptis-2.7.0/tests/test_annotation.py +70 -0
  180. inscriptis-2.7.0/tests/test_annotation_engine.py +22 -0
  181. inscriptis-2.7.0/tests/test_annotation_output_processor.py +85 -0
  182. inscriptis-2.7.0/tests/test_annotation_output_xml.py +73 -0
  183. inscriptis-2.7.0/tests/test_annotation_rule_parsing.py +74 -0
  184. inscriptis-2.7.0/tests/test_block.py +66 -0
  185. inscriptis-2.7.0/tests/test_broken_table_handling.py +23 -0
  186. inscriptis-2.7.0/tests/test_cli.py +122 -0
  187. inscriptis-2.7.0/tests/test_custom_html_tag_handling.py +31 -0
  188. inscriptis-2.7.0/tests/test_double_a.py +16 -0
  189. inscriptis-2.7.0/tests/test_empty_string.py +16 -0
  190. inscriptis-2.7.0/tests/test_engine.py +11 -0
  191. inscriptis-2.7.0/tests/test_html_conversion_options.py +72 -0
  192. inscriptis-2.7.0/tests/test_html_snippets.py +48 -0
  193. inscriptis-2.7.0/tests/test_html_snippets_annotations.py +63 -0
  194. inscriptis-2.7.0/tests/test_invalid_float_specification.py +16 -0
  195. inscriptis-2.7.0/tests/test_limit_whitespace_affixes.py +62 -0
  196. inscriptis-2.7.0/tests/test_list_div.py +29 -0
  197. inscriptis-2.7.0/tests/test_list_value.py +32 -0
  198. inscriptis-2.7.0/tests/test_margin_before_at_start.py +26 -0
  199. inscriptis-2.7.0/tests/test_margin_handling.py +36 -0
  200. inscriptis-2.7.0/tests/test_metadata.py +21 -0
  201. inscriptis-2.7.0/tests/test_model_html_element_canvas.py +55 -0
  202. inscriptis-2.7.0/tests/test_model_prefix.py +54 -0
  203. inscriptis-2.7.0/tests/test_parse_css.py +65 -0
  204. inscriptis-2.7.0/tests/test_strip_xml_header.py +10 -0
  205. inscriptis-2.7.0/tests/test_style_parsing.py +15 -0
  206. inscriptis-2.7.0/tests/test_table_cell.py +47 -0
  207. inscriptis-2.7.0/tests/test_table_cell_formatting.py +47 -0
  208. inscriptis-2.7.0/tests/test_table_row.py +26 -0
  209. inscriptis-2.7.0/tests/test_web_service.py +44 -0
  210. inscriptis-2.7.0/tests/test_white_space_handling.py +72 -0
  211. inscriptis-2.7.0/uv.lock +1399 -0
  212. inscriptis-2.6.0/pyproject.toml +0 -66
  213. {inscriptis-2.6.0 → inscriptis-2.7.0}/AUTHORS +0 -0
  214. {inscriptis-2.6.0 → inscriptis-2.7.0}/LICENSE +0 -0
  215. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/cli/__init__.py +0 -0
  216. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/html_properties.py +0 -0
  217. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/model/__init__.py +0 -0
  218. {inscriptis-2.6.0/src → inscriptis-2.7.0}/inscriptis/service/__init__.py +0 -0
@@ -0,0 +1,2 @@
1
+ [run]
2
+ omit = tests/
@@ -0,0 +1,2 @@
1
+ 55fa29ca39f9ed5895f9e88b2eb0f17e4d84245f
2
+ 4a41e301851661c9e74b851ecbeafa08767cd2d0
@@ -0,0 +1,18 @@
1
+ version: 2
2
+
3
+ updates:
4
+
5
+ # Enable version updates for github actions.
6
+ - package-ecosystem: "github-actions"
7
+ directory: "/"
8
+ schedule:
9
+ # Check for updates to GitHub Actions once a week
10
+ interval: "weekly"
11
+
12
+ # Enable version updates for Docker.
13
+ - package-ecosystem: "docker"
14
+ # Look for a `Dockerfile` in the `root` directory
15
+ directory: "/"
16
+ # Check for updates once a week
17
+ schedule:
18
+ interval: "weekly"
@@ -0,0 +1,68 @@
1
+ # For most projects, this workflow file will not need changing; you simply need
2
+ # to commit it to your repository.
3
+ #
4
+ # You may wish to alter this file to override the set of languages analyzed,
5
+ # or to provide custom queries or build logic.
6
+ #
7
+ # ******** NOTE ********
8
+ # We have attempted to detect the languages in your repository. Please check
9
+ # the `language` matrix defined below to confirm you have the correct set of
10
+ # supported CodeQL languages.
11
+ #
12
+ name: "CodeQL"
13
+
14
+ on:
15
+ push:
16
+ pull_request:
17
+ schedule:
18
+ - cron: '26 5 * * 2'
19
+
20
+ jobs:
21
+ analyze:
22
+ name: Analyze
23
+ runs-on: ubuntu-latest
24
+ permissions:
25
+ actions: read
26
+ contents: read
27
+ security-events: write
28
+
29
+ strategy:
30
+ fail-fast: false
31
+ matrix:
32
+ language: [ 'python' ]
33
+ # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
34
+ # Learn more:
35
+ # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
36
+
37
+ steps:
38
+ - name: Checkout repository
39
+ uses: actions/checkout@v3
40
+
41
+ # Initializes the CodeQL tools for scanning.
42
+ - name: Initialize CodeQL
43
+ uses: github/codeql-action/init@v2
44
+ with:
45
+ languages: ${{ matrix.language }}
46
+ # If you wish to specify custom queries, you can do so here or in a config file.
47
+ # By default, queries listed here will override any specified in a config file.
48
+ # Prefix the list here with "+" to use these queries and those in the config file.
49
+ # queries: ./path/to/local/query, your-org/your-repo/queries@main
50
+
51
+ # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
52
+ # If this step fails, then you should remove it and run the build manually (see below)
53
+ - name: Autobuild
54
+ uses: github/codeql-action/autobuild@v2
55
+
56
+ # ℹ️ Command-line programs to run using the OS shell.
57
+ # 📚 https://git.io/JvXDl
58
+
59
+ # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
60
+ # and modify them (or add more) to build your code if your project
61
+ # uses a compiled language
62
+
63
+ #- run: |
64
+ # make bootstrap
65
+ # make release
66
+
67
+ - name: Perform CodeQL Analysis
68
+ uses: github/codeql-action/analyze@v2
@@ -0,0 +1,35 @@
1
+ name: container
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - '*'
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - name: Checkout code
13
+ uses: actions/checkout@v3
14
+
15
+ - name: get version
16
+ id: version
17
+ run: echo ::set-output name=APP_VERSION::${GITHUB_REF/refs\/tags\//}
18
+
19
+ - name: init docker build
20
+ uses: docker/setup-buildx-action@v2
21
+
22
+ - name: login docker
23
+ uses: docker/login-action@v2
24
+ with:
25
+ registry: ghcr.io
26
+ username: ${{ github.actor }}
27
+ password: ${{ secrets.GITHUB_TOKEN }}
28
+
29
+ - name: publish container
30
+ uses: docker/build-push-action@v4
31
+ with:
32
+ push: true
33
+ tags: |
34
+ ghcr.io/weblyzard/inscriptis:v${{ steps.version.outputs.APP_VERSION }}
35
+ ghcr.io/weblyzard/inscriptis:latest
@@ -0,0 +1,20 @@
1
+ name: helm release
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - PhilippKuntschik-patch-2
7
+ tags:
8
+ - '*'
9
+
10
+ jobs:
11
+ dispatch_helm_release:
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - name: dispatch inscriptis-helm
15
+ uses: peter-evans/repository-dispatch@v2
16
+ with:
17
+ token: ${{ secrets.HELMREPO_ACCESS_TOKEN }}
18
+ repository: weblyzard/inscriptis-helm
19
+ event-type: tag-released
20
+ client-payload: '{"ref": "${{ github.ref_name }}"}'
@@ -0,0 +1,29 @@
1
+ name: build
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+
7
+ jobs:
8
+ build:
9
+
10
+ runs-on: ubuntu-24.04
11
+ strategy:
12
+ fail-fast: false
13
+ matrix:
14
+ python-version: [ '3.10', '3.11', '3.12', '3.13' ]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v3
18
+ - name: Set up Python ${{ matrix.python-version }}
19
+ uses: actions/setup-python@v4
20
+ with:
21
+ python-version: ${{ matrix.python-version }}
22
+ - name: Install build environment
23
+ run: |
24
+ python -m pip install --upgrade pip
25
+ python -m pip install uv
26
+ - name: Build and test with uv.
27
+ run: |
28
+ uv run ruff check
29
+ uv build
@@ -0,0 +1,30 @@
1
+ *.pyc
2
+ *.pyx
3
+ .*.swp
4
+ *.egg-info
5
+ __pycache__/
6
+ benchmarking_results/
7
+ html_cache/
8
+ .tox
9
+ build/
10
+ dist/
11
+ .cache/
12
+ .project
13
+ .pydevproject
14
+ .settings/
15
+ .pytest_cache/
16
+ .coverage
17
+ _build/
18
+ .mypy_cache/
19
+ .idea/
20
+ venv/
21
+ tests/converted.txt
22
+ tests/reference.txt
23
+ *.c
24
+ docs/paper/*.pdf
25
+ htmlcov/
26
+ poetry.lock
27
+
28
+ # test
29
+ converted.txt
30
+ reference.txt
@@ -0,0 +1,35 @@
1
+ # Read the Docs configuration file for Sphinx projects
2
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3
+
4
+ # Required
5
+ version: 2
6
+
7
+ # Set the OS, Python version and other tools you might need
8
+ build:
9
+ os: ubuntu-22.04
10
+ tools:
11
+ python: "3.12"
12
+ # You can also specify other tool versions:
13
+ # nodejs: "20"
14
+ # rust: "1.70"
15
+ # golang: "1.20"
16
+
17
+ # Build documentation in the "docs/" directory with Sphinx
18
+ sphinx:
19
+ configuration: docs/conf.py
20
+ # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs
21
+ # builder: "dirhtml"
22
+ # Fail on all warnings to avoid broken references
23
+ # fail_on_warning: true
24
+
25
+ # Optionally build your docs in additional formats such as PDF and ePub
26
+ formats:
27
+ - pdf
28
+ # - epub
29
+
30
+ # Optional but recommended, declare the Python requirements required
31
+ # to build your documentation
32
+ # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
33
+ python:
34
+ install:
35
+ - requirements: docs/requirements.txt
@@ -0,0 +1,5 @@
1
+ [project]
2
+ id = inscriptis
3
+ url = /codebases/inscriptis/findings
4
+ name = inscriptis
5
+
@@ -0,0 +1,54 @@
1
+ # Contributing to Inscriptis
2
+
3
+ First off, thank you for considering contributing to inscriptis.
4
+ There are many ways you can contribute to the project, and these guidelines aim to support you in doing so.
5
+
6
+ 1. [Reporting bugs and seeking support](#reporting-bugs-and-seeking-support)
7
+ 2. [Suggesting enhancements](#suggesting-enhancements)
8
+ 3. [Pull requests](#pull-requests) (contributing code)
9
+ 4. [Python style guide](#python-style-guide)
10
+
11
+
12
+ ## Reporting bugs and seeking support
13
+
14
+ Bugs and support requests are tracked as GitHub issues.
15
+
16
+ To create an effective and high quality ticket, please include the following information in your
17
+ ticket:
18
+
19
+ 1. **Use a clear and descriptive title** for the issue to identify the problem. This also helps other users to quickly locate bug reports that affect them.
20
+ 2. **Describe the exact steps necessary for reproducing the problem** including at least information on
21
+ - the affected URL
22
+ - the command line parameters or function arguments you used
23
+ 3. What would have been the **expected behavior**?
24
+ 4. Describe the **observed behavior**.
25
+ 5. Provide any additional information which might be helpful in reproducing and/or fixing this issue.
26
+
27
+
28
+ ## Suggesting enhancements
29
+
30
+ Enhancements are also tracked as GitHub issues and should contain the following information:
31
+
32
+ 1. **A clear and descriptive title** helps other people to identify enhancements they like, so that they can also add their thoughts and suggestions.
33
+ 2. **Provide a step-by-step description** of the suggested enhancement.
34
+ 3. **Describe the current behavior** and **explain which behavior you expected to see instead** and why.
35
+
36
+
37
+ ## Pull requests
38
+
39
+ 1. Ensure that your code complies with our [Python style guide](#python-style-guide).
40
+ 2. Write a unit test that covers your new code and put it into the `./tests` directory.
41
+ 3. Execute `tox .` in the project's root directory to ensure that your code passes the static code analysis, coding style guidelines and security checks.
42
+ 4. In addition, please document any new API functions in the Inscriptis documentation.
43
+
44
+
45
+ ## Python style guide
46
+
47
+ Inscriptis code should comply with
48
+ - the [PEP8 Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/), and
49
+ - to the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html)
50
+
51
+ Please also ensure that
52
+ 1. functions are properly documented with docstrings that comply with the Google Python Style Guide, and
53
+ 2. any new code is covered by unit tests.
54
+
@@ -0,0 +1,24 @@
1
+ #
2
+ # Stage 1 - Install build dependencies
3
+ #
4
+ FROM python:3.11-slim-bullseye AS builder
5
+
6
+ WORKDIR /inscriptis
7
+ RUN python -m venv .venv && .venv/bin/python -m pip install --upgrade pip
8
+ RUN .venv/bin/pip install --no-cache-dir inscriptis[web-service] && \
9
+ find /inscriptis/.venv \( -type d -a -name test -o -name tests \) -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) -exec rm -rf '{}' \+
10
+
11
+ #
12
+ # Stage 2 - Copy only necessary files to the runner stage
13
+ #
14
+ FROM python:3.11-slim-bullseye
15
+ LABEL maintainer="albert@weichselbraun.net"
16
+
17
+ # Note: only copy the /inscriptis directory (virtual environment) from the builder stage, to prevent bloating the image with
18
+ # irrelevant files from the project directory.
19
+ WORKDIR /inscriptis
20
+ COPY --from=builder /inscriptis /inscriptis
21
+
22
+ ENV PATH="/inscriptis/.venv/bin:$PATH"
23
+ CMD ["uvicorn", "inscriptis.service.web:app", "--port=5000", "--host=0.0.0.0"]
24
+ EXPOSE 5000
@@ -1,32 +1,32 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: inscriptis
3
- Version: 2.6.0
3
+ Version: 2.7.0
4
4
  Summary: inscriptis - HTML to text converter.
5
- Home-page: https://github.com/weblyzard/inscriptis
6
- License: Apache-2.0
5
+ Project-URL: Homepage, https://github.com/weblyzard/inscriptis
6
+ Project-URL: Repository, https://github.com/weblyzard/inscriptis
7
+ Project-URL: Documentation, https://inscriptis.readthedocs.io/en/latest/
8
+ Author-email: Albert Weichselbraun <albert.weichselbraun@fhgr.ch>, Fabian Odoni <fabian.odoni@fhgr.ch>
9
+ License-Expression: Apache-2.0
10
+ License-File: AUTHORS
11
+ License-File: LICENSE
7
12
  Keywords: HTML,converter,text
8
- Author: Albert Weichselbraun
9
- Author-email: albert.weichselbraun@fhgr.ch
10
- Requires-Python: >=3.9,<4.0
11
13
  Classifier: Development Status :: 5 - Production/Stable
12
14
  Classifier: Intended Audience :: Developers
13
- Classifier: License :: OSI Approved :: Apache Software License
14
15
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.9
16
16
  Classifier: Programming Language :: Python :: 3.10
17
17
  Classifier: Programming Language :: Python :: 3.11
18
18
  Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Programming Language :: Python :: 3.14
20
21
  Classifier: Topic :: Text Processing
21
22
  Classifier: Topic :: Text Processing :: Markup :: HTML
22
23
  Classifier: Topic :: Utilities
24
+ Requires-Python: <3.15,>=3.10
25
+ Requires-Dist: lxml<6.0.0,>=5.4.0
26
+ Requires-Dist: requests<3.0.0,>=2.32.3
23
27
  Provides-Extra: web-service
24
- Requires-Dist: fastapi (>=0.115.11,<0.116.0) ; extra == "web-service"
25
- Requires-Dist: lxml (>=4.9.3)
26
- Requires-Dist: requests (>=2.32.2)
27
- Requires-Dist: uvicorn (>=0.34.0,<0.35.0) ; extra == "web-service"
28
- Project-URL: Documentation, https://inscriptis.readthedocs.io/en
29
- Project-URL: Repository, https://github.com/weblyzard/inscriptis
28
+ Requires-Dist: fastapi<1.0.0,>=0.115.11; extra == 'web-service'
29
+ Requires-Dist: uvicorn<1.0.0,>=0.34.0; extra == 'web-service'
30
30
  Description-Content-Type: text/x-rst
31
31
 
32
32
  ==================================================================================
@@ -37,10 +37,6 @@ inscriptis -- HTML to text conversion library, command line client and Web servi
37
37
  :target: https://badge.fury.io/py/inscriptis
38
38
  :alt: Supported python versions
39
39
 
40
- .. image:: https://api.codeclimate.com/v1/badges/f8ed73f8a764f2bc4eba/maintainability
41
- :target: https://codeclimate.com/github/weblyzard/inscriptis/maintainability
42
- :alt: Maintainability
43
-
44
40
  .. image:: https://codecov.io/gh/weblyzard/inscriptis/branch/master/graph/badge.svg
45
41
  :target: https://codecov.io/gh/weblyzard/inscriptis/
46
42
  :alt: Coverage
@@ -523,41 +519,112 @@ be used within a program:
523
519
  print("Text:", output['text'])
524
520
  print("Annotations:", output['label'])
525
521
 
526
- Fine tuning
527
- -----------
522
+ Fine-tuning the HTML rendering
523
+ ------------------------------
528
524
 
529
- The following options are available for fine tuning inscriptis' HTML rendering:
525
+ Inscriptis provides the ``ParserConfig`` class to fine-tune the HTML rendering
526
+ (`see documentation <https://inscriptis.readthedocs.io/en/latest/api.html#inscriptis.model.config.ParserConfig>`_).
530
527
 
531
- 1. **More rigorous indentation:** call ``inscriptis.get_text()`` with the
532
- parameter ``indentation='extended'`` to also use indentation for tags such as
533
- ``<div>`` and ``<span>`` that do not provide indentation in their standard
534
- definition. This strategy is the default in ``inscript`` and many other
535
- tools such as Lynx. If you do not want extended indentation you can use the
536
- parameter ``indentation='standard'`` instead.
528
+ It allows modifying the interpretation of HTML-tags and setting parameters that control the rendering of anchors,
529
+ captions, images and links.
537
530
 
538
- 2. **Overwriting the default CSS definition:** inscriptis uses CSS definitions
539
- that are maintained in ``inscriptis.css.CSS`` for rendering HTML tags. You can
540
- override these definitions (and therefore change the rendering) as outlined
541
- below:
531
+ 1. **Firefox-like whitespace handling:** Use the more standards-compliant `strict` CSS profile to render the page.
532
+ (``<div>`` and ``<span>`` do not add whitespaces in the `strict` profile. Many text-based browsers such
533
+ as Lynx and ``inscript`` add whitespace by default to reduce the likelihood of words getting glued together).
542
534
 
543
- .. code-block:: python
535
+ .. code-block:: python
536
+
537
+ from lxml.html import fromstring
538
+
539
+ from inscriptis import Inscriptis
540
+ from inscriptis.css_profiles import CSS_PROFILES
541
+ from inscriptis.model.config import ParserConfig
542
+
543
+ # create a ParserConfig that uses the strict CSS rendering profile
544
+ css = CSS_PROFILES['strict']
545
+ config = ParserConfig(css=css)
546
+
547
+ html_tree = fromstring(html)
548
+ parser = Inscriptis(html_tree, config)
549
+ text = parser.get_text()
550
+
551
+ 2. **Firefox-like whitespace handling and fine-tuning of link handling:** Use the strict profile
552
+ together with inline links and anchor URLs.
553
+
554
+ .. code-block:: python
555
+
556
+ from lxml.html import fromstring
557
+
558
+ from inscriptis import Inscriptis
559
+ from inscriptis.css_profiles import CSS_PROFILES
560
+ from inscriptis.model.config import ParserConfig
561
+
562
+ # use the strict CSS rendering profile and fine-tune link handling.
563
+ css = CSS_PROFILES['strict']
564
+ config = ParserConfig(css=css, display_links=True,
565
+ display_anchors=True)
566
+
567
+ html_tree = fromstring(html)
568
+ parser = Inscriptis(html_tree, config)
569
+ text = parser.get_text()
570
+
571
+
572
+ 3. **Overwriting the default CSS definition:** inscriptis uses CSS definitions
573
+ that are maintained in ``inscriptis.css_profiles.CSS_PROFILES`` for
574
+ rendering HTML tags. You can override these definitions (and therefore
575
+ change the rendering) as outlined below:
576
+
577
+ .. code-block:: python
578
+
579
+ from lxml.html import fromstring
580
+
581
+ from inscriptis import Inscriptis
582
+ from inscriptis.css_profiles import CSS_PROFILES
583
+ from inscriptis.html_properties import Display
584
+ from inscriptis.model.config import ParserConfig
585
+ from inscriptis.model.html_element import HtmlElement
586
+
587
+ # Create a custom CSS based on the default style sheet and change the
588
+ # rendering of `div` and `span` elements.
589
+ css = CSS_PROFILES['strict'].copy()
590
+ css['div'] = HtmlElement(display=Display.block, padding=2)
591
+ css['span'] = HtmlElement(prefix=' ', suffix=' ')
592
+
593
+ html_tree = fromstring(html)
594
+ # create a parser using a custom css
595
+ config = ParserConfig(css=css)
596
+ parser = Inscriptis(html_tree, config)
597
+ text = parser.get_text()
598
+
599
+ 4. **Ignore elements during parsing:**
600
+ Overwriting the default CSS profile also allows changing the rendering of selected elements.
601
+ The snippet below, for example, removes forms from the parsed text by setting the definition of the ``form`` tag to ``Display.none``.
602
+
603
+ .. code-block:: python
604
+
605
+ from inscriptis import get_text
606
+ from inscriptis.css_profiles import CSS_PROFILES, HtmlElement
607
+ from inscriptis.html_properties import Display
608
+ from inscriptis.model.config import ParserConfig
609
+
610
+ # create a custom CSS based on the default style sheet and change the
611
+ # rendering of the `form` element
612
+ css = CSS_PROFILES['strict'].copy()
613
+ css['form'] = HtmlElement(display=Display.none)
614
+
615
+ # create a parser configuration using a custom css
616
+ html = """First line.
617
+ <form>
618
+ User data
619
+ <label for="name">Name:</label><br>
620
+ <input type="text" id="name" name="name"><br>
621
+ <label for="pass">Password:</label><br>
622
+ <input type="hidden" id="pass" name="pass">
623
+ </form>"""
624
+ config = ParserConfig(css=css)
625
+ text = get_text(html, config)
626
+ print(text)
544
627
 
545
- from lxml.html import fromstring
546
- from inscriptis.css_profiles import CSS_PROFILES, HtmlElement
547
- from inscriptis.html_properties import Display
548
- from inscriptis.model.config import ParserConfig
549
-
550
- # create a custom CSS based on the default style sheet and change the
551
- # rendering of `div` and `span` elements
552
- css = CSS_PROFILES['strict'].copy()
553
- css['div'] = HtmlElement(display=Display.block, padding=2)
554
- css['span'] = HtmlElement(prefix=' ', suffix=' ')
555
-
556
- html_tree = fromstring(html)
557
- # create a parser using a custom css
558
- config = ParserConfig(css=css)
559
- parser = Inscriptis(html_tree, config)
560
- text = parser.get_text()
561
628
 
562
629
 
563
630
  Custom HTML tag handling
@@ -601,55 +668,6 @@ The following code mitigates this problem on Unix systems by manually forcing lx
601
668
  return libc.malloc_trim(0)
602
669
 
603
670
 
604
- Examples
605
- ========
606
-
607
- Strict indentation handling
608
- ---------------------------
609
-
610
- The following example demonstrates modifying ``ParserConfig`` for strict indentation handling.
611
-
612
- .. code-block:: python
613
-
614
- from inscriptis import get_text
615
- from inscriptis.css_profiles import CSS_PROFILES
616
- from inscriptis.model.config import ParserConfig
617
-
618
- config = ParserConfig(css=CSS_PROFILES['strict'].copy())
619
- text = get_text('fi<span>r</span>st', config)
620
- print(text)
621
-
622
- Ignore elements during parsing
623
- ------------------------------
624
-
625
- Overwriting the default CSS profile also allows changing the rendering of selected elements.
626
- The snippet below, for example, removes forms from the parsed text by setting the definition of the ``form`` tag to ``Display.none``.
627
-
628
- .. code-block:: python
629
-
630
- from inscriptis import get_text
631
- from inscriptis.css_profiles import CSS_PROFILES, HtmlElement
632
- from inscriptis.html_properties import Display
633
- from inscriptis.model.config import ParserConfig
634
-
635
- # create a custom CSS based on the default style sheet and change the
636
- # rendering of `div` and `span` elements
637
- css = CSS_PROFILES['strict'].copy()
638
- css['form'] = HtmlElement(display=Display.none)
639
-
640
- # create a parser configuration using a custom css
641
- html = """First line.
642
- <form>
643
- User data
644
- <label for="name">Name:</label><br>
645
- <input type="text" id="name" name="name"><br>
646
- <label for="pass">Password:</label><br>
647
- <input type="hidden" id="pass" name="pass">
648
- </form>"""
649
- config = ParserConfig(css=css)
650
- text = get_text(html, config)
651
- print(text)
652
-
653
671
 
654
672
  Citation
655
673
  ========
@@ -678,4 +696,3 @@ Changelog
678
696
  A full list of changes can be found in the
679
697
  `release notes <https://github.com/weblyzard/inscriptis/releases>`_.
680
698
 
681
-