cocoindex 0.1.54__tar.gz → 0.1.55__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. {cocoindex-0.1.54 → cocoindex-0.1.55}/.github/workflows/CI.yml +13 -1
  2. {cocoindex-0.1.54 → cocoindex-0.1.55}/Cargo.lock +2 -1
  3. {cocoindex-0.1.54 → cocoindex-0.1.55}/Cargo.toml +2 -2
  4. {cocoindex-0.1.54 → cocoindex-0.1.55}/PKG-INFO +2 -1
  5. cocoindex-0.1.55/docs/docs/ai/llm.mdx +309 -0
  6. cocoindex-0.1.55/docs/docs/ops/functions.md +142 -0
  7. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/ops/sources.md +25 -25
  8. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/ops/targets.md +25 -25
  9. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/code_embedding/main.py +1 -1
  10. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding/main.py +1 -1
  11. {cocoindex-0.1.54 → cocoindex-0.1.55}/pyproject.toml +1 -0
  12. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/cli.py +84 -5
  13. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/convert.py +0 -5
  14. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/tests/test_convert.py +4 -4
  15. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/base/schema.rs +1 -1
  16. cocoindex-0.1.55/src/builder/analyzed_flow.rs +66 -0
  17. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/builder/analyzer.rs +108 -296
  18. cocoindex-0.1.55/src/builder/exec_ctx.rs +275 -0
  19. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/builder/flow_builder.rs +41 -25
  20. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/builder/mod.rs +2 -0
  21. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/builder/plan.rs +0 -4
  22. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/execution/db_tracking.rs +23 -2
  23. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/execution/db_tracking_setup.rs +1 -0
  24. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/execution/dumper.rs +24 -11
  25. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/execution/evaluator.rs +3 -3
  26. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/execution/indexing_status.rs +3 -2
  27. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/execution/live_updater.rs +4 -4
  28. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/execution/memoization.rs +8 -1
  29. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/execution/row_indexer.rs +375 -46
  30. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/execution/source_indexer.rs +33 -21
  31. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/lib_context.rs +57 -16
  32. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/factory_bases.rs +14 -10
  33. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/functions/embed_text.rs +4 -4
  34. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/functions/extract_by_llm.rs +4 -4
  35. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/functions/parse_json.rs +4 -4
  36. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/functions/split_recursively.rs +189 -52
  37. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/interface.rs +5 -3
  38. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/py_factory.rs +2 -1
  39. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/registration.rs +9 -3
  40. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/sources/amazon_s3.rs +1 -1
  41. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/sources/google_drive.rs +1 -1
  42. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/sources/local_file.rs +1 -1
  43. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/targets/kuzu.rs +1 -1
  44. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/targets/neo4j.rs +1 -1
  45. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/targets/postgres.rs +1 -1
  46. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/targets/qdrant.rs +1 -1
  47. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/prelude.rs +1 -1
  48. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/py/convert.rs +2 -4
  49. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/py/mod.rs +2 -0
  50. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/service/flows.rs +6 -0
  51. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/setup/driver.rs +17 -12
  52. cocoindex-0.1.54/docs/docs/ai/llm.mdx +0 -209
  53. cocoindex-0.1.54/docs/docs/ops/functions.md +0 -107
  54. cocoindex-0.1.54/src/builder/analyzed_flow.rs +0 -90
  55. {cocoindex-0.1.54 → cocoindex-0.1.55}/.cargo/config.toml +0 -0
  56. {cocoindex-0.1.54 → cocoindex-0.1.55}/.env.lib_debug +0 -0
  57. {cocoindex-0.1.54 → cocoindex-0.1.55}/.github/ISSUE_TEMPLATE/🐛-bug-report.md +0 -0
  58. {cocoindex-0.1.54 → cocoindex-0.1.55}/.github/ISSUE_TEMPLATE/💡-feature-request.md +0 -0
  59. {cocoindex-0.1.54 → cocoindex-0.1.55}/.github/scripts/update_version.sh +0 -0
  60. {cocoindex-0.1.54 → cocoindex-0.1.55}/.github/workflows/_doc_release.yml +0 -0
  61. {cocoindex-0.1.54 → cocoindex-0.1.55}/.github/workflows/_test.yml +0 -0
  62. {cocoindex-0.1.54 → cocoindex-0.1.55}/.github/workflows/docs.yml +0 -0
  63. {cocoindex-0.1.54 → cocoindex-0.1.55}/.github/workflows/release.yml +0 -0
  64. {cocoindex-0.1.54 → cocoindex-0.1.55}/.gitignore +0 -0
  65. {cocoindex-0.1.54 → cocoindex-0.1.55}/.pre-commit-config.yaml +0 -0
  66. {cocoindex-0.1.54 → cocoindex-0.1.55}/.vscode/settings.json +0 -0
  67. {cocoindex-0.1.54 → cocoindex-0.1.55}/CODE_OF_CONDUCT.md +0 -0
  68. {cocoindex-0.1.54 → cocoindex-0.1.55}/CONTRIBUTING.md +0 -0
  69. {cocoindex-0.1.54 → cocoindex-0.1.55}/LICENSE +0 -0
  70. {cocoindex-0.1.54 → cocoindex-0.1.55}/README.md +0 -0
  71. {cocoindex-0.1.54 → cocoindex-0.1.55}/dev/neo4j.yaml +0 -0
  72. {cocoindex-0.1.54 → cocoindex-0.1.55}/dev/postgres.yaml +0 -0
  73. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/.gitignore +0 -0
  74. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/README.md +0 -0
  75. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/about/community.md +0 -0
  76. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/about/contributing.md +0 -0
  77. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/core/basics.md +0 -0
  78. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/core/cli.mdx +0 -0
  79. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/core/custom_function.mdx +0 -0
  80. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/core/data_example.svg +0 -0
  81. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/core/data_types.mdx +0 -0
  82. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/core/flow_def.mdx +0 -0
  83. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/core/flow_example.svg +0 -0
  84. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/core/flow_methods.mdx +0 -0
  85. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/core/settings.mdx +0 -0
  86. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/getting_started/installation.md +0 -0
  87. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/getting_started/markdown_files.zip +0 -0
  88. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/getting_started/overview.md +0 -0
  89. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/getting_started/quickstart.md +0 -0
  90. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docs/query.mdx +0 -0
  91. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/docusaurus.config.ts +0 -0
  92. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/package.json +0 -0
  93. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/sidebars.ts +0 -0
  94. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/src/components/HomepageFeatures/index.tsx +0 -0
  95. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/src/components/HomepageFeatures/styles.module.css +0 -0
  96. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/src/css/custom.css +0 -0
  97. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/src/theme/Root.js +0 -0
  98. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/static/.nojekyll +0 -0
  99. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/static/img/docusaurus.png +0 -0
  100. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/static/img/favicon.ico +0 -0
  101. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/static/img/icon.svg +0 -0
  102. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/static/img/incremental-etl.gif +0 -0
  103. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/static/robots.txt +0 -0
  104. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/tsconfig.json +0 -0
  105. {cocoindex-0.1.54 → cocoindex-0.1.55}/docs/yarn.lock +0 -0
  106. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/amazon_s3_embedding/.env.example +0 -0
  107. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/amazon_s3_embedding/.gitignore +0 -0
  108. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/amazon_s3_embedding/README.md +0 -0
  109. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/amazon_s3_embedding/main.py +0 -0
  110. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/amazon_s3_embedding/pyproject.toml +0 -0
  111. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/code_embedding/.env +0 -0
  112. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/code_embedding/README.md +0 -0
  113. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/code_embedding/pyproject.toml +0 -0
  114. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/docs_to_knowledge_graph/.env +0 -0
  115. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/docs_to_knowledge_graph/README.md +0 -0
  116. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/docs_to_knowledge_graph/main.py +0 -0
  117. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/docs_to_knowledge_graph/pyproject.toml +0 -0
  118. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/fastapi_server_docker/.dockerignore +0 -0
  119. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/fastapi_server_docker/.env +0 -0
  120. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/fastapi_server_docker/README.md +0 -0
  121. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/fastapi_server_docker/compose.yaml +0 -0
  122. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/fastapi_server_docker/dockerfile +0 -0
  123. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/fastapi_server_docker/files/1810.04805v2.md +0 -0
  124. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/fastapi_server_docker/main.py +0 -0
  125. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/fastapi_server_docker/requirements.txt +0 -0
  126. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/gdrive_text_embedding/.env.example +0 -0
  127. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/gdrive_text_embedding/.gitignore +0 -0
  128. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/gdrive_text_embedding/README.md +0 -0
  129. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/gdrive_text_embedding/main.py +0 -0
  130. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/gdrive_text_embedding/pyproject.toml +0 -0
  131. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/.env +0 -0
  132. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/README.md +0 -0
  133. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/frontend/.gitignore +0 -0
  134. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/frontend/index.html +0 -0
  135. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/frontend/package-lock.json +0 -0
  136. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/frontend/package.json +0 -0
  137. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/frontend/src/App.jsx +0 -0
  138. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/frontend/src/main.jsx +0 -0
  139. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/frontend/src/style.css +0 -0
  140. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/frontend/vite.config.js +0 -0
  141. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/img/cat1.jpeg +0 -0
  142. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/img/dog1.jpeg +0 -0
  143. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/img/elephant1.jpg +0 -0
  144. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/img/giraffe.jpg +0 -0
  145. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/main.py +0 -0
  146. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/pyproject.toml +0 -0
  147. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/image_search/requirements.txt +0 -0
  148. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/manuals_llm_extraction/.env +0 -0
  149. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/manuals_llm_extraction/README.md +0 -0
  150. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/manuals_llm_extraction/main.py +0 -0
  151. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/manuals_llm_extraction/manuals/array.pdf +0 -0
  152. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/manuals_llm_extraction/manuals/base64.pdf +0 -0
  153. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/manuals_llm_extraction/manuals/copy.pdf +0 -0
  154. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/manuals_llm_extraction/manuals/glob.pdf +0 -0
  155. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/manuals_llm_extraction/pyproject.toml +0 -0
  156. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/pdf_embedding/.env +0 -0
  157. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/pdf_embedding/README.md +0 -0
  158. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/pdf_embedding/main.py +0 -0
  159. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/pdf_embedding/pdf_files/1706.03762v7.pdf +0 -0
  160. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/pdf_embedding/pdf_files/1810.04805v2.pdf +0 -0
  161. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/pdf_embedding/pdf_files/rfc8259.pdf +0 -0
  162. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/pdf_embedding/pyproject.toml +0 -0
  163. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/.env +0 -0
  164. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/README.md +0 -0
  165. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/img/cocoinsight.png +0 -0
  166. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/img/neo4j.png +0 -0
  167. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/main.py +0 -0
  168. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/products/p1.json +0 -0
  169. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/products/p2.json +0 -0
  170. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/products/p3.json +0 -0
  171. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/products/p4.json +0 -0
  172. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/products/p5.json +0 -0
  173. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/products/p6.json +0 -0
  174. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/products/p7.json +0 -0
  175. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/products/p8.json +0 -0
  176. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/products/p9.json +0 -0
  177. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/product_recommendation/pyproject.toml +0 -0
  178. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding/.env +0 -0
  179. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding/README.md +0 -0
  180. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding/Text_Embedding.ipynb +0 -0
  181. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding/markdown_files/1706.03762v7.md +0 -0
  182. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding/markdown_files/1810.04805v2.md +0 -0
  183. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding/markdown_files/rfc8259.md +0 -0
  184. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding/pyproject.toml +0 -0
  185. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding_qdrant/.env +0 -0
  186. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding_qdrant/README.md +0 -0
  187. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding_qdrant/main.py +0 -0
  188. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding_qdrant/markdown_files/rfc8259.md +0 -0
  189. {cocoindex-0.1.54 → cocoindex-0.1.55}/examples/text_embedding_qdrant/pyproject.toml +0 -0
  190. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/__init__.py +0 -0
  191. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/auth_registry.py +0 -0
  192. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/flow.py +0 -0
  193. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/functions.py +0 -0
  194. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/index.py +0 -0
  195. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/lib.py +0 -0
  196. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/llm.py +0 -0
  197. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/op.py +0 -0
  198. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/py.typed +0 -0
  199. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/runtime.py +0 -0
  200. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/setting.py +0 -0
  201. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/setup.py +0 -0
  202. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/sources.py +0 -0
  203. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/targets.py +0 -0
  204. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/tests/__init__.py +0 -0
  205. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/tests/test_optional_database.py +0 -0
  206. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/tests/test_typing.py +0 -0
  207. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/typing.py +0 -0
  208. {cocoindex-0.1.54 → cocoindex-0.1.55}/python/cocoindex/utils.py +0 -0
  209. {cocoindex-0.1.54 → cocoindex-0.1.55}/ruff.toml +0 -0
  210. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/base/duration.rs +0 -0
  211. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/base/field_attrs.rs +0 -0
  212. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/base/json_schema.rs +0 -0
  213. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/base/mod.rs +0 -0
  214. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/base/spec.rs +0 -0
  215. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/base/value.rs +0 -0
  216. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/execution/mod.rs +0 -0
  217. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/execution/stats.rs +0 -0
  218. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/lib.rs +0 -0
  219. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/llm/anthropic.rs +0 -0
  220. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/llm/gemini.rs +0 -0
  221. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/llm/litellm.rs +0 -0
  222. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/llm/mod.rs +0 -0
  223. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/llm/ollama.rs +0 -0
  224. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/llm/openai.rs +0 -0
  225. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/llm/openrouter.rs +0 -0
  226. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/llm/voyage.rs +0 -0
  227. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/functions/mod.rs +0 -0
  228. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/mod.rs +0 -0
  229. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/registry.rs +0 -0
  230. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/sdk.rs +0 -0
  231. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/sources/mod.rs +0 -0
  232. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/targets/mod.rs +0 -0
  233. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/targets/shared/mod.rs +0 -0
  234. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/targets/shared/property_graph.rs +0 -0
  235. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/ops/targets/shared/table_columns.rs +0 -0
  236. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/server.rs +0 -0
  237. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/service/error.rs +0 -0
  238. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/service/mod.rs +0 -0
  239. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/settings.rs +0 -0
  240. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/setup/auth_registry.rs +0 -0
  241. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/setup/components.rs +0 -0
  242. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/setup/db_metadata.rs +0 -0
  243. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/setup/mod.rs +0 -0
  244. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/setup/states.rs +0 -0
  245. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/utils/db.rs +0 -0
  246. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/utils/fingerprint.rs +0 -0
  247. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/utils/immutable.rs +0 -0
  248. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/utils/mod.rs +0 -0
  249. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/utils/retryable.rs +0 -0
  250. {cocoindex-0.1.54 → cocoindex-0.1.55}/src/utils/yaml_ser.rs +0 -0
@@ -26,7 +26,19 @@ permissions:
26
26
  contents: read
27
27
 
28
28
  jobs:
29
- format-check:
29
+ rust-format-check:
30
+ name: Check Rust formatting
31
+ runs-on: ubuntu-latest
32
+ steps:
33
+ - uses: actions/checkout@v4
34
+ - uses: dtolnay/rust-toolchain@stable
35
+ with:
36
+ components: rustfmt
37
+ - name: Check Rust formatting
38
+ run: |
39
+ cargo fmt --check
40
+
41
+ python-format-check:
30
42
  name: Check Python formatting
31
43
  runs-on: ubuntu-latest
32
44
  steps:
@@ -1040,7 +1040,7 @@ dependencies = [
1040
1040
 
1041
1041
  [[package]]
1042
1042
  name = "cocoindex"
1043
- version = "0.1.54"
1043
+ version = "0.1.55"
1044
1044
  dependencies = [
1045
1045
  "anyhow",
1046
1046
  "async-openai",
@@ -3293,6 +3293,7 @@ dependencies = [
3293
3293
  "pyo3-ffi",
3294
3294
  "pyo3-macros",
3295
3295
  "unindent",
3296
+ "uuid",
3296
3297
  ]
3297
3298
 
3298
3299
  [[package]]
@@ -2,7 +2,7 @@
2
2
  name = "cocoindex"
3
3
  # Version used for local development is always higher than others to take precedence.
4
4
  # Will be overridden for specific release versions.
5
- version = "0.1.54"
5
+ version = "0.1.55"
6
6
  edition = "2024"
7
7
  rust-version = "1.86"
8
8
 
@@ -15,7 +15,7 @@ name = "cocoindex_engine"
15
15
  crate-type = ["cdylib"]
16
16
 
17
17
  [dependencies]
18
- pyo3 = { version = "0.25.0", features = ["chrono", "auto-initialize"] }
18
+ pyo3 = { version = "0.25.0", features = ["chrono", "auto-initialize", "uuid"] }
19
19
  pythonize = "0.25.0"
20
20
  pyo3-async-runtimes = { version = "0.25.0", features = ["tokio-runtime"] }
21
21
 
@@ -1,10 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cocoindex
3
- Version: 0.1.54
3
+ Version: 0.1.55
4
4
  Requires-Dist: sentence-transformers>=3.3.1
5
5
  Requires-Dist: click>=8.1.8
6
6
  Requires-Dist: rich>=14.0.0
7
7
  Requires-Dist: python-dotenv>=1.1.0
8
+ Requires-Dist: watchfiles>=1.1.0
8
9
  Requires-Dist: pytest ; extra == 'test'
9
10
  Requires-Dist: ruff ; extra == 'dev'
10
11
  Requires-Dist: pre-commit ; extra == 'dev'
@@ -0,0 +1,309 @@
1
+ ---
2
+ title: LLM Support
3
+ description: LLMs integrated with CocoIndex for various built-in functions
4
+ ---
5
+
6
+ import Tabs from '@theme/Tabs';
7
+ import TabItem from '@theme/TabItem';
8
+
9
+ CocoIndex provides builtin functions integrating with various LLM APIs, for various inference tasks:
10
+ * [Text Generation](#text-generation): use LLM to generate text.
11
+ * [Text Embedding](#text-embedding): embed text into a vector space.
12
+
13
+ ## LLM API Types
14
+
15
+ We support integrating with LLMs through different types of APIs.
16
+ Each LLM API type is specified by a `cocoindex.LlmApiType` enum.
17
+
18
+ We support the following types of LLM APIs:
19
+
20
+ | API Name | `LlmApiType` enum | Text Generation | Text Embedding |
21
+ |----------|---------------------|--------------------|--------------------|
22
+ | [OpenAI](#openai) | `LlmApiType.OPENAI` | ✅ | ✅ |
23
+ | [Ollama](#ollama) | `LlmApiType.OLLAMA` | ✅ | ❌ |
24
+ | [Google Gemini](#google-gemini) | `LlmApiType.GEMINI` | ✅ | ✅ |
25
+ | [Anthropic](#anthropic) | `LlmApiType.ANTHROPIC` | ✅ | ❌ |
26
+ | [Voyage](#voyage) | `LlmApiType.VOYAGE` | ❌ | ✅ |
27
+ | [LiteLLM](#litellm) | `LlmApiType.LITE_LLM` | ✅ | ❌ |
28
+ | [OpenRouter](#openrouter) | `LlmApiType.OPEN_ROUTER` | ✅ | ❌ |
29
+
30
+ ## LLM Tasks
31
+
32
+ ### Text Generation
33
+
34
+ Generation is used as a building block for certain CocoIndex functions that process data using LLM generation.
35
+
36
+ We have one builtin function using LLM generation for now:
37
+
38
+ * [`ExtractByLlm`](/docs/ops/functions#extractbyllm): it extracts information from input text.
39
+
40
+ #### LLM Spec
41
+
42
+ When calling a CocoIndex function that uses LLM generation, you need to provide a `cocoindex.LlmSpec` dataclass, to configure the LLM you want to use in these functions.
43
+ It has the following fields:
44
+
45
+ * `api_type` (type: [`cocoindex.LlmApiType`](/docs/ai/llm#llm-api-types), required): The type of integrated LLM API to use, e.g. `cocoindex.LlmApiType.OPENAI` or `cocoindex.LlmApiType.OLLAMA`.
46
+ See supported LLM APIs in the [LLM API integrations](#llm-api-integrations) section below.
47
+ * `model` (type: `str`, required): The name of the LLM model to use.
48
+ * `address` (type: `str`, optional): The address of the LLM API.
49
+
50
+
51
+ ### Text Embedding
52
+
53
+ Embedding means converting text into a vector space, usually for similarity matching.
54
+
55
+ We provide a builtin function [`EmbedText`](/docs/ops/functions#embedtext) that converts a given text into a vector space.
56
+ The spec takes the following fields:
57
+
58
+ * `api_type` (type: `cocoindex.LlmApiType`, required)
59
+ * `model` (type: `str`, required)
60
+ * `address` (type: `str`, optional)
61
+ * `output_dimension` (type: `int`, optional)
62
+ * `task_type` (type: `str`, optional)
63
+
64
+ See documentation for [`EmbedText`](/docs/ops/functions#embedtext) for more details about these fields.
65
+
66
+ ## LLM API Integrations
67
+
68
+ CocoIndex integrates with various LLM APIs for these functions.
69
+
70
+ ### OpenAI
71
+
72
+ To use the OpenAI LLM API, you need to set the environment variable `OPENAI_API_KEY`.
73
+ You can generate the API key from [OpenAI Dashboard](https://platform.openai.com/api-keys).
74
+
75
+ Currently we don't support custom address for OpenAI API.
76
+
77
+ You can find the full list of models supported by OpenAI [here](https://platform.openai.com/docs/models).
78
+
79
+ For text generation, a spec for OpenAI looks like this:
80
+
81
+ <Tabs>
82
+ <TabItem value="python" label="Python" default>
83
+
84
+ ```python
85
+ cocoindex.LlmSpec(
86
+ api_type=cocoindex.LlmApiType.OPENAI,
87
+ model="gpt-4o",
88
+ )
89
+ ```
90
+
91
+ </TabItem>
92
+ </Tabs>
93
+
94
+ For text embedding, a spec for OpenAI looks like this:
95
+
96
+ <Tabs>
97
+ <TabItem value="python" label="Python" default>
98
+
99
+ ```python
100
+ cocoindex.functions.EmbedText(
101
+ api_type=cocoindex.LlmApiType.OPENAI,
102
+ model="text-embedding-3-small",
103
+ )
104
+ ```
105
+
106
+ </TabItem>
107
+ </Tabs>
108
+
109
+ ### Ollama
110
+
111
+ [Ollama](https://ollama.com/) allows you to run LLM models on your local machine easily. To get started:
112
+
113
+ * [Download](https://ollama.com/download) and install Ollama.
114
+ * Pull your favorite LLM models by the `ollama pull` command, e.g.
115
+ ```bash
116
+ ollama pull llama3.2
117
+ ```
118
+ You can find the [list of models](https://ollama.com/library) supported by Ollama.
119
+
120
+ A spec for Ollama looks like this:
121
+
122
+ <Tabs>
123
+ <TabItem value="python" label="Python" default>
124
+
125
+ ```python
126
+ cocoindex.LlmSpec(
127
+ api_type=cocoindex.LlmApiType.OLLAMA,
128
+ model="llama3.2:latest",
129
+ # Optional, use Ollama's default port (11434) on localhost if not specified
130
+ address="http://localhost:11434",
131
+ )
132
+ ```
133
+
134
+ </TabItem>
135
+ </Tabs>
136
+
137
+ ### Google Gemini
138
+
139
+ To use the Gemini LLM API, you need to set the environment variable `GEMINI_API_KEY`.
140
+ You can generate the API key from [Google AI Studio](https://aistudio.google.com/apikey).
141
+
142
+ You can find the full list of models supported by Gemini [here](https://ai.google.dev/gemini-api/docs/models).
143
+
144
+ For text generation, a spec looks like this:
145
+
146
+ <Tabs>
147
+ <TabItem value="python" label="Python" default>
148
+
149
+ ```python
150
+ cocoindex.LlmSpec(
151
+ api_type=cocoindex.LlmApiType.GEMINI,
152
+ model="gemini-2.0-flash",
153
+ )
154
+ ```
155
+
156
+ </TabItem>
157
+ </Tabs>
158
+
159
+ For text embedding, a spec looks like this:
160
+
161
+ <Tabs>
162
+ <TabItem value="python" label="Python" default>
163
+
164
+ ```python
165
+ cocoindex.functions.EmbedText(
166
+ api_type=cocoindex.LlmApiType.GEMINI,
167
+ model="text-embedding-004",
168
+ task_type="SEMANTIC_SIMILARITY",
169
+ )
170
+ ```
171
+
172
+ All supported embedding models can be found [here](https://ai.google.dev/gemini-api/docs/embeddings#embeddings-models).
173
+ Gemini supports task type (optional), which can be found [here](https://ai.google.dev/gemini-api/docs/embeddings#supported-task-types).
174
+
175
+
176
+ </TabItem>
177
+ </Tabs>
178
+
179
+ ### Anthropic
180
+
181
+ To use the Anthropic LLM API, you need to set the environment variable `ANTHROPIC_API_KEY`.
182
+ You can generate the API key from [Anthropic API](https://console.anthropic.com/settings/keys).
183
+
184
+ A text generation spec for Anthropic looks like this:
185
+
186
+ <Tabs>
187
+ <TabItem value="python" label="Python" default>
188
+
189
+ ```python
190
+ cocoindex.LlmSpec(
191
+ api_type=cocoindex.LlmApiType.ANTHROPIC,
192
+ model="claude-3-5-sonnet-latest",
193
+ )
194
+ ```
195
+
196
+ </TabItem>
197
+ </Tabs>
198
+
199
+ You can find the full list of models supported by Anthropic [here](https://docs.anthropic.com/en/docs/about-claude/models/all-models).
200
+
201
+ ### Voyage
202
+
203
+ To use the Voyage LLM API, you need to set the environment variable `VOYAGE_API_KEY`.
204
+ You can generate the API key from [Voyage dashboard](https://dashboard.voyageai.com/organization/api-keys).
205
+
206
+ A text embedding spec for Voyage looks like this:
207
+
208
+ <Tabs>
209
+ <TabItem value="python" label="Python" default>
210
+
211
+ ```python
212
+ cocoindex.functions.EmbedText(
213
+ api_type=cocoindex.LlmApiType.VOYAGE,
214
+ model="voyage-code-3",
215
+ task_type="document",
216
+ )
217
+ ```
218
+
219
+ </TabItem>
220
+ </Tabs>
221
+
222
+ Voyage API supports `document` and `query` as task types (optional, a.k.a. `input_type` in Voyage API, see [Voyage API documentation](https://docs.voyageai.com/reference/embeddings-api) for details).
223
+
224
+ ### LiteLLM
225
+
226
+ To use the LiteLLM API, you need to set the environment variable `LITELLM_API_KEY`.
227
+
228
+ #### 1. Install LiteLLM Proxy
229
+
230
+ ```bash
231
+ pip install 'litellm[proxy]'
232
+ ```
233
+
234
+ #### 2. Create a `config.yml` for LiteLLM
235
+
236
+ **Example for DeepSeek:**
237
+
238
+ Use this in your `config.yml`:
239
+
240
+ ```yaml
241
+ model_list:
242
+ - model_name: deepseek-chat
243
+ litellm_params:
244
+ model: deepseek/deepseek-chat
245
+ api_key: os.environ/DEEPSEEK_API_KEY
246
+ ```
247
+
248
+ You need to set the environment variable `DEEPSEEK_API_KEY` to your DeepSeek API key.
249
+
250
+ **Example for Groq:**
251
+
252
+ Use this in your `config.yml`:
253
+
254
+ ```yaml
255
+ model_list:
256
+ - model_name: groq-llama-3.3-70b-versatile
257
+ litellm_params:
258
+ model: groq/llama-3.3-70b-versatile
259
+ api_key: "os.environ/GROQ_API_KEY"
260
+ ```
261
+
262
+ You need to set the environment variable `GROQ_API_KEY` to your Groq API key.
263
+
264
+
265
+ #### 3. Run LiteLLM Proxy
266
+
267
+ ```bash
268
+ litellm --config config.yml
269
+ ```
270
+
271
+ #### 4. A Spec for LiteLLM will look like this:
272
+
273
+ <Tabs>
274
+ <TabItem value="python" label="Python" default>
275
+
276
+ ```python
277
+ cocoindex.LlmSpec(
278
+ api_type=cocoindex.LlmApiType.LITE_LLM,
279
+ model="deepseek-chat",
280
+ address="http://127.0.0.1:4000", # default url of LiteLLM
281
+ )
282
+ ```
283
+
284
+ </TabItem>
285
+ </Tabs>
286
+
287
+ You can find the full list of models supported by LiteLLM [here](https://docs.litellm.ai/docs/providers).
288
+
289
+ ### OpenRouter
290
+
291
+ To use the OpenRouter API, you need to set the environment variable `OPENROUTER_API_KEY`.
292
+ You can generate the API key from [here](https://openrouter.ai/settings/keys).
293
+
294
+ A spec for OpenRouter looks like this:
295
+
296
+ <Tabs>
297
+ <TabItem value="python" label="Python" default>
298
+
299
+ ```python
300
+ cocoindex.LlmSpec(
301
+ api_type=cocoindex.LlmApiType.OPEN_ROUTER,
302
+ model="deepseek/deepseek-r1:free",
303
+ )
304
+ ```
305
+
306
+ </TabItem>
307
+ </Tabs>
308
+
309
+ You can find the full list of models supported by OpenRouter [here](https://openrouter.ai/models).
@@ -0,0 +1,142 @@
1
+ ---
2
+ title: Functions
3
+ description: CocoIndex Built-in Functions
4
+ ---
5
+
6
+ # CocoIndex Built-in Functions
7
+
8
+ ## ParseJson
9
+
10
+ `ParseJson` parses a given text to JSON.
11
+
12
+ The spec takes the following fields:
13
+
14
+ * `text` (`str`): The source text to parse.
15
+ * `language` (`str`, optional): The language of the source text. Only `json` is supported now. Defaults to `json`.
16
+
17
+ Return: *Json*
18
+
19
+ ## SplitRecursively
20
+
21
+ `SplitRecursively` splits a document into chunks of a given size.
22
+ It tries to split at higher-level boundaries. If a chunk is still too large, it tries the next level of boundaries.
23
+ For example, for a Markdown file, it identifies boundaries in this order: level-1 sections, level-2 sections, level-3 sections, paragraphs, sentences, etc.
24
+
25
+ The spec takes the following fields:
26
+
27
+ * `custom_languages` (`list[CustomLanguageSpec]`, optional): This allows you to customize how specific languages are chunked, using regular expressions. Each `CustomLanguageSpec` is a dict with the following fields:
28
+ * `language_name` (`str`): Name of the language.
29
+ * `aliases` (`list[str]`, optional): A list of aliases for the language.
30
+ It's an error if any language name or alias is duplicated.
31
+
32
+ * `separators_regex` (`list[str]`): A list of regex patterns to split the text.
33
+ Higher-level boundaries should come first, and lower-level should be listed later. e.g. `[r"\n# ", r"\n## ", r"\n\n", r"\. "]`.
34
+ See [regex Syntax](https://docs.rs/regex/latest/regex/#syntax) for supported regular expression syntax.
35
+
36
+ Input data:
37
+
38
+ * `text` (*Str*): The text to split.
39
+ * `chunk_size` (*Int64*): The maximum size of each chunk, in bytes.
40
+ * `min_chunk_size` (*Int64*, optional): The minimum size of each chunk, in bytes. If not provided, defaults to `chunk_size / 2`.
41
+
42
+ :::note
43
+
44
+ `SplitRecursively` will do its best to make the output chunks sized between `min_chunk_size` and `chunk_size`.
45
+ However, it's possible that some chunks are smaller than `min_chunk_size` or larger than `chunk_size` in rare cases, e.g. too short input text, or non-splittable large text.
46
+
47
+ Please avoid setting `min_chunk_size` to a value too close to `chunk_size`, to leave more room for the function to plan the optimal chunking.
48
+
49
+ :::
50
+
51
+ * `chunk_overlap` (*Int64*, optional): The maximum overlap size between adjacent chunks, in bytes.
52
+ * `language` (*Str*, optional): The language of the document.
53
+ Can be a language name (e.g. `Python`, `Javascript`, `Markdown`) or a file extension (e.g. `.py`, `.js`, `.md`).
54
+
55
+
56
+ :::note
57
+
58
+ We use the `language` field to determine how to split the input text, following these rules:
59
+
60
+ * We'll match the input `language` field against the `language_name` or `aliases` of each element of `custom_languages`, and use the matched one. If the value of `language` is null, it'll be treated as an empty string when matching `language_name` or `aliases`.
61
+ * If no match is found, we'll match the `language` field against the builtin language configurations.
62
+ For all supported builtin language names and aliases (extensions), see [the code](https://github.com/search?q=org%3Acocoindex-io+lang%3Arust++%22static+TREE_SITTER_LANGUAGE_BY_LANG%22&type=code).
63
+ * If no match is found, the input will be treated as plain text.
64
+
65
+ :::
66
+
67
+ Return: [*KTable*](/docs/core/data_types#ktable), each row represents a chunk, with the following sub fields:
68
+
69
+ * `location` (*Range*): The location of the chunk.
70
+ * `text` (*Str*): The text of the chunk.
71
+ * `start` / `end` (*Struct*): Details about the start position (inclusive) and end position (exclusive) of the chunk. They have the following sub fields:
72
+ * `offset` (*Int64*): The byte offset of the position.
73
+ * `line` (*Int64*): The line number of the position. Starting from 1.
74
+ * `column` (*Int64*): The column number of the position. Starting from 1.
75
+
76
+ ## SentenceTransformerEmbed
77
+
78
+ `SentenceTransformerEmbed` embeds a text into a vector space using the [SentenceTransformer](https://huggingface.co/sentence-transformers) library.
79
+
80
+ The spec takes the following fields:
81
+
82
+ * `model` (`str`): The name of the SentenceTransformer model to use.
83
+ * `args` (`dict[str, Any]`, optional): Additional arguments to pass to the SentenceTransformer constructor. e.g. `{"trust_remote_code": True}`
84
+
85
+ Input data:
86
+
87
+ * `text` (*Str*): The text to embed.
88
+
89
+ Return: *Vector[Float32, N]*, where *N* is determined by the model
90
+
91
+ ## ExtractByLlm
92
+
93
+ `ExtractByLlm` extracts structured information from a text using the specified LLM. The spec takes the following fields:
94
+
95
+ * `llm_spec` (`cocoindex.LlmSpec`): The specification of the LLM to use. See [LLM Spec](/docs/ai/llm#llm-spec) for more details.
96
+ * `output_type` (`type`): The type of the output. e.g. a dataclass type name. See [Data Types](/docs/core/data_types) for all supported data types. The LLM will output values that match the schema of the type.
97
+ * `instruction` (`str`, optional): Additional instruction for the LLM.
98
+
99
+ :::tip Clear type definitions
100
+
101
+ The definition of the `output_type` is fed into the LLM as guidance to generate the output.
102
+ To improve the quality of the extracted information, giving clear definitions for your dataclasses is especially important, e.g.
103
+
104
+ * Provide readable field names for your dataclasses.
105
+ * Provide reasonable docstrings for your dataclasses.
106
+ * For any optional fields, clearly annotate that they are optional, by `SomeType | None` or `typing.Optional[SomeType]`.
107
+
108
+ :::
109
+
110
+ Input data:
111
+
112
+ * `text` (*Str*): The text to extract information from.
113
+
114
+ Return: As specified by the `output_type` field in the spec. The extracted information from the input text.
115
+
116
+ ## EmbedText
117
+
118
+ `EmbedText` embeds a text into a vector space using various LLM APIs that support text embedding.
119
+
120
+ The spec takes the following fields:
121
+
122
+ * `api_type` ([`cocoindex.LlmApiType`](/docs/ai/llm#llm-api-types)): The type of LLM API to use for embedding.
123
+ * `model` (`str`): The name of the embedding model to use.
124
+ * `address` (`str`, optional): The address of the LLM API. If not specified, uses the default address for the API type.
125
+ * `output_dimension` (`int`, optional): The expected dimension of the output embedding vector. If not specified, uses the default dimension of the model.
126
+
127
+ For most API types, the function internally keeps a registry of the default output dimensions of known models.
128
+ You need to explicitly specify the `output_dimension` if you want to use a new model that is not in the registry yet.
129
+
130
+ * `task_type` (`str`, optional): The task type for embedding, used by some embedding models to optimize the embedding for specific use cases.
131
+
132
+ :::note Supported APIs for Text Embedding
133
+
134
+ Not all LLM APIs support text embedding. See the [LLM API Types table](/docs/ai/llm#llm-api-types) for which APIs support text embedding functionality.
135
+
136
+ :::
137
+
138
+ Input data:
139
+
140
+ * `text` (*Str*, required): The text to embed.
141
+
142
+ Return: *Vector[Float32, N]*, where *N* is the dimension of the embedding vector determined by the model.
@@ -13,11 +13,11 @@ The `LocalFile` source imports files from a local file system.
13
13
  ### Spec
14
14
 
15
15
  The spec takes the following fields:
16
- * `path` (type: `str`, required): full path of the root directory to import files from
17
- * `binary` (type: `bool`, optional): whether reading files as binary (instead of text)
18
- * `included_patterns` (type: `list[str]`, optional): a list of glob patterns to include files, e.g. `["*.txt", "docs/**/*.md"]`.
16
+ * `path` (`str`): full path of the root directory to import files from
17
+ * `binary` (`bool`, optional): whether reading files as binary (instead of text)
18
+ * `included_patterns` (`list[str]`, optional): a list of glob patterns to include files, e.g. `["*.txt", "docs/**/*.md"]`.
19
19
  If not specified, all files will be included.
20
- * `excluded_patterns` (type: `list[str]`, optional): a list of glob patterns to exclude files, e.g. `["tmp", "**/node_modules"]`.
20
+ * `excluded_patterns` (`list[str]`, optional): a list of glob patterns to exclude files, e.g. `["tmp", "**/node_modules"]`.
21
21
  Any file or directory matching these patterns will be excluded even if they match `included_patterns`.
22
22
  If not specified, no files will be excluded.
23
23
 
@@ -29,9 +29,9 @@ The spec takes the following fields:
29
29
 
30
30
  ### Schema
31
31
 
32
- The output is a [KTable](/docs/core/data_types#ktable) with the following sub fields:
33
- * `filename` (key, type: `str`): the filename of the file, including the path, relative to the root directory, e.g. `"dir1/file1.md"`
34
- * `content` (type: `str` if `binary` is `False`, otherwise `bytes`): the content of the file
32
+ The output is a [*KTable*](/docs/core/data_types#ktable) with the following sub fields:
33
+ * `filename` (*Str*, key): the filename of the file, including the path, relative to the root directory, e.g. `"dir1/file1.md"`
34
+ * `content` (*Str* if `binary` is `False`, otherwise *Bytes*): the content of the file
35
35
 
36
36
  ## AmazonS3
37
37
 
@@ -121,12 +121,12 @@ AWS's [Guide of Configuring a Bucket for Notifications](https://docs.aws.amazon.
121
121
  ### Spec
122
122
 
123
123
  The spec takes the following fields:
124
- * `bucket_name` (type: `str`, required): Amazon S3 bucket name.
125
- * `prefix` (type: `str`, optional): if provided, only files with path starting with this prefix will be imported.
126
- * `binary` (type: `bool`, optional): whether reading files as binary (instead of text).
127
- * `included_patterns` (type: `list[str]`, optional): a list of glob patterns to include files, e.g. `["*.txt", "docs/**/*.md"]`.
124
+ * `bucket_name` (`str`): Amazon S3 bucket name.
125
+ * `prefix` (`str`, optional): if provided, only files with path starting with this prefix will be imported.
126
+ * `binary` (`bool`, optional): whether reading files as binary (instead of text).
127
+ * `included_patterns` (`list[str]`, optional): a list of glob patterns to include files, e.g. `["*.txt", "docs/**/*.md"]`.
128
128
  If not specified, all files will be included.
129
- * `excluded_patterns` (type: `list[str]`, optional): a list of glob patterns to exclude files, e.g. `["*.tmp", "**/*.log"]`.
129
+ * `excluded_patterns` (`list[str]`, optional): a list of glob patterns to exclude files, e.g. `["*.tmp", "**/*.log"]`.
130
130
  Any file or directory matching these patterns will be excluded even if they match `included_patterns`.
131
131
  If not specified, no files will be excluded.
132
132
 
@@ -136,7 +136,7 @@ The spec takes the following fields:
136
136
 
137
137
  :::
138
138
 
139
- * `sqs_queue_url` (type: `str`, optional): if provided, the source will receive change event notifications from Amazon S3 via this SQS queue.
139
+ * `sqs_queue_url` (`str`, optional): if provided, the source will receive change event notifications from Amazon S3 via this SQS queue.
140
140
 
141
141
  :::info
142
142
 
@@ -147,9 +147,9 @@ The spec takes the following fields:
147
147
 
148
148
  ### Schema
149
149
 
150
- The output is a [KTable](/docs/core/data_types#ktable) with the following sub fields:
151
- * `filename` (key, type: `str`): the filename of the file, including the path, relative to the root directory, e.g. `"dir1/file1.md"`.
152
- * `content` (type: `str` if `binary` is `False`, otherwise `bytes`): the content of the file.
150
+ The output is a [*KTable*](/docs/core/data_types#ktable) with the following sub fields:
151
+ * `filename` (*Str*, key): the filename of the file, including the path, relative to the root directory, e.g. `"dir1/file1.md"`.
152
+ * `content` (*Str* if `binary` is `False`, otherwise *Bytes*): the content of the file.
153
153
 
154
154
 
155
155
  ## GoogleDrive
@@ -176,10 +176,10 @@ To access files in Google Drive, the `GoogleDrive` source will need to authentic
176
176
 
177
177
  The spec takes the following fields:
178
178
 
179
- * `service_account_credential_path` (type: `str`, required): full path to the service account credential file in JSON format.
180
- * `root_folder_ids` (type: `list[str]`, required): a list of Google Drive folder IDs to import files from.
181
- * `binary` (type: `bool`, optional): whether reading files as binary (instead of text).
182
- * `recent_changes_poll_interval` (type: `datetime.timedelta`, optional): when set, this source provides a change capture mechanism by polling Google Drive for recent modified files periodically.
179
+ * `service_account_credential_path` (`str`): full path to the service account credential file in JSON format.
180
+ * `root_folder_ids` (`list[str]`): a list of Google Drive folder IDs to import files from.
181
+ * `binary` (`bool`, optional): whether reading files as binary (instead of text).
182
+ * `recent_changes_poll_interval` (`datetime.timedelta`, optional): when set, this source provides a change capture mechanism by polling Google Drive for recent modified files periodically.
183
183
 
184
184
  :::info
185
185
 
@@ -198,9 +198,9 @@ The spec takes the following fields:
198
198
 
199
199
  ### Schema
200
200
 
201
- The output is a [KTable](/docs/core/data_types#ktable) with the following sub fields:
201
+ The output is a [*KTable*](/docs/core/data_types#ktable) with the following sub fields:
202
202
 
203
- * `file_id` (key, type: `str`): the ID of the file in Google Drive.
204
- * `filename` (type: `str`): the filename of the file, without the path, e.g. `"file1.md"`
205
- * `mime_type` (type: `str`): the MIME type of the file.
206
- * `content` (type: `str` if `binary` is `False`, otherwise `bytes`): the content of the file.
203
+ * `file_id` (*Str*, key): the ID of the file in Google Drive.
204
+ * `filename` (*Str*): the filename of the file, without the path, e.g. `"file1.md"`
205
+ * `mime_type` (*Str*): the MIME type of the file.
206
+ * `content` (*Str* if `binary` is `False`, otherwise *Bytes*): the content of the file.