cocoindex 0.1.40__tar.gz → 0.1.42__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241) hide show
  1. {cocoindex-0.1.40 → cocoindex-0.1.42}/Cargo.lock +1 -1
  2. {cocoindex-0.1.40 → cocoindex-0.1.42}/Cargo.toml +1 -1
  3. {cocoindex-0.1.40 → cocoindex-0.1.42}/PKG-INFO +2 -1
  4. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/basics.md +10 -19
  5. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/data_types.mdx +1 -0
  6. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/flow_def.mdx +0 -1
  7. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/getting_started/quickstart.md +128 -45
  8. cocoindex-0.1.42/docs/docs/query.mdx +102 -0
  9. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/sidebars.ts +5 -0
  10. cocoindex-0.1.42/examples/code_embedding/README.md +71 -0
  11. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/code_embedding/main.py +27 -12
  12. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/main.py +1 -1
  13. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/main.py +1 -1
  14. cocoindex-0.1.42/examples/text_embedding/README.md +63 -0
  15. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/main.py +26 -11
  16. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/pyproject.toml +5 -1
  17. cocoindex-0.1.42/examples/text_embedding_qdrant/README.md +87 -0
  18. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding_qdrant/main.py +28 -19
  19. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding_qdrant/pyproject.toml +1 -1
  20. {cocoindex-0.1.40 → cocoindex-0.1.42}/pyproject.toml +4 -1
  21. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/__init__.py +5 -4
  22. cocoindex-0.1.42/python/cocoindex/cli.py +437 -0
  23. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/flow.py +14 -7
  24. cocoindex-0.1.42/python/cocoindex/lib.py +71 -0
  25. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/typing.py +2 -0
  26. cocoindex-0.1.42/src/base/duration.rs +674 -0
  27. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/base/json_schema.rs +11 -0
  28. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/base/mod.rs +1 -0
  29. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/base/schema.rs +4 -0
  30. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/base/value.rs +16 -1
  31. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/query.rs +2 -1
  32. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/storages/neo4j.rs +14 -4
  33. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/storages/postgres.rs +12 -0
  34. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/storages/qdrant.rs +9 -2
  35. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/py/convert.rs +6 -2
  36. cocoindex-0.1.40/examples/code_embedding/README.md +0 -52
  37. cocoindex-0.1.40/examples/text_embedding/README.md +0 -46
  38. cocoindex-0.1.40/examples/text_embedding_qdrant/README.md +0 -69
  39. cocoindex-0.1.40/python/cocoindex/cli.py +0 -238
  40. cocoindex-0.1.40/python/cocoindex/lib.py +0 -78
  41. {cocoindex-0.1.40 → cocoindex-0.1.42}/.cargo/config.toml +0 -0
  42. {cocoindex-0.1.40 → cocoindex-0.1.42}/.env.lib_debug +0 -0
  43. {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/ISSUE_TEMPLATE/🐛-bug-report.md +0 -0
  44. {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/ISSUE_TEMPLATE/💡-feature-request.md +0 -0
  45. {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/scripts/update_version.sh +0 -0
  46. {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/workflows/CI.yml +0 -0
  47. {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/workflows/_test.yml +0 -0
  48. {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/workflows/docs.yml +0 -0
  49. {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/workflows/release.yml +0 -0
  50. {cocoindex-0.1.40 → cocoindex-0.1.42}/.gitignore +0 -0
  51. {cocoindex-0.1.40 → cocoindex-0.1.42}/.vscode/settings.json +0 -0
  52. {cocoindex-0.1.40 → cocoindex-0.1.42}/CODE_OF_CONDUCT.md +0 -0
  53. {cocoindex-0.1.40 → cocoindex-0.1.42}/CONTRIBUTING.md +0 -0
  54. {cocoindex-0.1.40 → cocoindex-0.1.42}/LICENSE +0 -0
  55. {cocoindex-0.1.40 → cocoindex-0.1.42}/README.md +0 -0
  56. {cocoindex-0.1.40 → cocoindex-0.1.42}/dev/neo4j.yaml +0 -0
  57. {cocoindex-0.1.40 → cocoindex-0.1.42}/dev/postgres.yaml +0 -0
  58. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/.gitignore +0 -0
  59. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/README.md +0 -0
  60. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/about/community.md +0 -0
  61. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/about/contributing.md +0 -0
  62. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/ai/llm.mdx +0 -0
  63. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/cli.mdx +0 -0
  64. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/custom_function.mdx +0 -0
  65. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/data_example.svg +0 -0
  66. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/flow_example.svg +0 -0
  67. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/flow_methods.mdx +0 -0
  68. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/initialization.mdx +0 -0
  69. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/getting_started/installation.md +0 -0
  70. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/getting_started/markdown_files.zip +0 -0
  71. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/getting_started/overview.md +0 -0
  72. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/ops/functions.md +0 -0
  73. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/ops/sources.md +0 -0
  74. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/ops/storages.md +0 -0
  75. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docusaurus.config.ts +0 -0
  76. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/package.json +0 -0
  77. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/src/components/HomepageFeatures/index.tsx +0 -0
  78. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/src/components/HomepageFeatures/styles.module.css +0 -0
  79. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/src/css/custom.css +0 -0
  80. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/src/theme/Root.js +0 -0
  81. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/static/.nojekyll +0 -0
  82. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/static/img/docusaurus.png +0 -0
  83. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/static/img/favicon.ico +0 -0
  84. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/static/img/icon.svg +0 -0
  85. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/static/robots.txt +0 -0
  86. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/tsconfig.json +0 -0
  87. {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/yarn.lock +0 -0
  88. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/amazon_s3_embedding/.env.example +0 -0
  89. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/amazon_s3_embedding/.gitignore +0 -0
  90. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/amazon_s3_embedding/README.md +0 -0
  91. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/amazon_s3_embedding/main.py +0 -0
  92. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/amazon_s3_embedding/pyproject.toml +0 -0
  93. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/code_embedding/.env +0 -0
  94. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/code_embedding/pyproject.toml +0 -0
  95. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/docs_to_knowledge_graph/.env +0 -0
  96. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/docs_to_knowledge_graph/README.md +0 -0
  97. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/docs_to_knowledge_graph/main.py +0 -0
  98. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/docs_to_knowledge_graph/pyproject.toml +0 -0
  99. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/.dockerignore +0 -0
  100. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/.env +0 -0
  101. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/README.md +0 -0
  102. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/compose.yaml +0 -0
  103. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/dockerfile +0 -0
  104. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/main.py +0 -0
  105. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/requirements.txt +0 -0
  106. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/sample_code/main.py +0 -0
  107. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/src/cocoindex_funs.py +0 -0
  108. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/gdrive_text_embedding/.env.example +0 -0
  109. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/gdrive_text_embedding/.gitignore +0 -0
  110. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/gdrive_text_embedding/README.md +0 -0
  111. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/gdrive_text_embedding/main.py +0 -0
  112. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/gdrive_text_embedding/pyproject.toml +0 -0
  113. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/.env +0 -0
  114. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/README.md +0 -0
  115. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/.gitignore +0 -0
  116. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/index.html +0 -0
  117. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/package-lock.json +0 -0
  118. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/package.json +0 -0
  119. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/src/App.jsx +0 -0
  120. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/src/main.jsx +0 -0
  121. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/src/style.css +0 -0
  122. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/vite.config.js +0 -0
  123. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/img/cat1.jpeg +0 -0
  124. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/img/dog1.jpeg +0 -0
  125. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/img/elephant1.jpg +0 -0
  126. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/img/giraffe.jpg +0 -0
  127. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/requirements.txt +0 -0
  128. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/.env +0 -0
  129. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/README.md +0 -0
  130. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/main.py +0 -0
  131. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/manuals/array.pdf +0 -0
  132. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/manuals/base64.pdf +0 -0
  133. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/manuals/copy.pdf +0 -0
  134. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/manuals/glob.pdf +0 -0
  135. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/pyproject.toml +0 -0
  136. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/.env +0 -0
  137. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/README.md +0 -0
  138. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/main.py +0 -0
  139. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/pdf_files/1706.03762v7.pdf +0 -0
  140. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/pdf_files/1810.04805v2.pdf +0 -0
  141. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/pdf_files/rfc8259.pdf +0 -0
  142. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/pyproject.toml +0 -0
  143. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/.env +0 -0
  144. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/README.md +0 -0
  145. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/img/cocoinsight.png +0 -0
  146. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/img/neo4j.png +0 -0
  147. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p1.json +0 -0
  148. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p2.json +0 -0
  149. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p3.json +0 -0
  150. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p4.json +0 -0
  151. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p5.json +0 -0
  152. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p6.json +0 -0
  153. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p7.json +0 -0
  154. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p8.json +0 -0
  155. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p9.json +0 -0
  156. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/pyproject.toml +0 -0
  157. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/.env +0 -0
  158. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/Text_Embedding.ipynb +0 -0
  159. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/markdown_files/1706.03762v7.md +0 -0
  160. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/markdown_files/1810.04805v2.md +0 -0
  161. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/markdown_files/rfc8259.md +0 -0
  162. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding_qdrant/.env +0 -0
  163. {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding_qdrant/markdown_files/rfc8259.md +0 -0
  164. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/auth_registry.py +0 -0
  165. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/convert.py +0 -0
  166. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/functions.py +0 -0
  167. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/index.py +0 -0
  168. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/llm.py +0 -0
  169. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/op.py +0 -0
  170. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/py.typed +0 -0
  171. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/query.py +0 -0
  172. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/runtime.py +0 -0
  173. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/setting.py +0 -0
  174. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/setup.py +0 -0
  175. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/sources.py +0 -0
  176. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/storages.py +0 -0
  177. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/tests/__init__.py +0 -0
  178. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/tests/test_convert.py +0 -0
  179. {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/utils.py +0 -0
  180. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/base/field_attrs.rs +0 -0
  181. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/base/spec.rs +0 -0
  182. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/builder/analyzed_flow.rs +0 -0
  183. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/builder/analyzer.rs +0 -0
  184. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/builder/flow_builder.rs +0 -0
  185. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/builder/mod.rs +0 -0
  186. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/builder/plan.rs +0 -0
  187. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/db_tracking.rs +0 -0
  188. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/db_tracking_setup.rs +0 -0
  189. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/dumper.rs +0 -0
  190. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/evaluator.rs +0 -0
  191. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/indexing_status.rs +0 -0
  192. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/live_updater.rs +0 -0
  193. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/memoization.rs +0 -0
  194. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/mod.rs +0 -0
  195. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/row_indexer.rs +0 -0
  196. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/source_indexer.rs +0 -0
  197. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/stats.rs +0 -0
  198. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/lib.rs +0 -0
  199. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/lib_context.rs +0 -0
  200. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/llm/anthropic.rs +0 -0
  201. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/llm/gemini.rs +0 -0
  202. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/llm/mod.rs +0 -0
  203. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/llm/ollama.rs +0 -0
  204. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/llm/openai.rs +0 -0
  205. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/factory_bases.rs +0 -0
  206. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/functions/extract_by_llm.rs +0 -0
  207. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/functions/mod.rs +0 -0
  208. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/functions/parse_json.rs +0 -0
  209. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/functions/split_recursively.rs +0 -0
  210. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/interface.rs +0 -0
  211. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/mod.rs +0 -0
  212. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/py_factory.rs +0 -0
  213. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/registration.rs +0 -0
  214. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/registry.rs +0 -0
  215. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/sdk.rs +0 -0
  216. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/sources/amazon_s3.rs +0 -0
  217. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/sources/google_drive.rs +0 -0
  218. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/sources/local_file.rs +0 -0
  219. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/sources/mod.rs +0 -0
  220. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/storages/mod.rs +0 -0
  221. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/storages/spec.rs +0 -0
  222. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/prelude.rs +0 -0
  223. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/py/mod.rs +0 -0
  224. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/server.rs +0 -0
  225. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/service/error.rs +0 -0
  226. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/service/flows.rs +0 -0
  227. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/service/mod.rs +0 -0
  228. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/service/search.rs +0 -0
  229. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/settings.rs +0 -0
  230. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/setup/auth_registry.rs +0 -0
  231. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/setup/components.rs +0 -0
  232. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/setup/db_metadata.rs +0 -0
  233. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/setup/driver.rs +0 -0
  234. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/setup/mod.rs +0 -0
  235. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/setup/states.rs +0 -0
  236. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/utils/db.rs +0 -0
  237. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/utils/fingerprint.rs +0 -0
  238. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/utils/immutable.rs +0 -0
  239. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/utils/mod.rs +0 -0
  240. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/utils/retryable.rs +0 -0
  241. {cocoindex-0.1.40 → cocoindex-0.1.42}/src/utils/yaml_ser.rs +0 -0
@@ -993,7 +993,7 @@ dependencies = [
993
993
 
994
994
  [[package]]
995
995
  name = "cocoindex"
996
- version = "0.1.40"
996
+ version = "0.1.42"
997
997
  dependencies = [
998
998
  "anyhow",
999
999
  "async-openai",
@@ -2,7 +2,7 @@
2
2
  name = "cocoindex"
3
3
  # Version used for local development is always higher than others to take precedence.
4
4
  # Will be overridden for specific release versions.
5
- version = "0.1.40"
5
+ version = "0.1.42"
6
6
  edition = "2024"
7
7
 
8
8
  [profile.release]
@@ -1,9 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cocoindex
3
- Version: 0.1.40
3
+ Version: 0.1.42
4
4
  Requires-Dist: sentence-transformers>=3.3.1
5
5
  Requires-Dist: click>=8.1.8
6
6
  Requires-Dist: rich>=14.0.0
7
+ Requires-Dist: python-dotenv>=1.1.0
7
8
  Requires-Dist: pytest ; extra == 'test'
8
9
  Provides-Extra: test
9
10
  License-File: LICENSE
@@ -1,17 +1,17 @@
1
1
  ---
2
- title: Basics
3
- description: "CocoIndex basic concepts: indexing flow, data, operations, data updates, etc."
2
+ title: Indexing Basics
3
+ description: "CocoIndex basic concepts for indexing: indexing flow, data, operations, data updates, etc."
4
4
  ---
5
5
 
6
- # CocoIndex Basics
6
+ # CocoIndex Indexing Basics
7
7
 
8
8
  An **index** is a collection of data stored in a way that is easy for retrieval.
9
9
 
10
- CocoIndex is an ETL framework for building indexes from specified data sources, a.k.a. indexing. It also offers utilities for users to retrieve data from the indexes.
10
+ CocoIndex is an ETL framework for building indexes from specified data sources, a.k.a. **indexing**. It also offers utilities for users to retrieve data from the indexes.
11
11
 
12
- ## Indexing flow
12
+ An **indexing flow** extracts data from specified data sources, applies specified transformations, and puts the transformed data into specified storage for later retrieval.
13
13
 
14
- An indexing flow extracts data from specified data sources, upon specified transformations, and puts the transformed data into specified storage for later retrieval.
14
+ ## Indexing flow elements
15
15
 
16
16
  An indexing flow has two aspects: data and operations on data.
17
17
 
@@ -42,7 +42,7 @@ An **operation** in an indexing flow defines a step in the flow. An operation is
42
42
 
43
43
  "import" and "transform" operations produce output data, whose data type is determined based on the operation spec and data types of input data (for "transform" operation only).
44
44
 
45
- ### Example
45
+ ## An indexing flow example
46
46
 
47
47
  For the example shown in the [Quickstart](../getting_started/quickstart) section, the indexing flow is as follows:
48
48
 
@@ -60,7 +60,7 @@ This shows schema and example data for the indexing flow:
60
60
 
61
61
  ![Data Example](data_example.svg)
62
62
 
63
- ### Life cycle of an indexing flow
63
+ ## Life cycle of an indexing flow
64
64
 
65
65
  An indexing flow, once set up, maintains a long-lived relationship between data source and data in target storage. This means:
66
66
 
@@ -95,19 +95,10 @@ CocoIndex works the same way, but with more powerful capabilities:
95
95
 
96
96
  This means when writing your flow operations, you can treat source data as if it were static - focusing purely on defining the transformation logic. CocoIndex takes care of maintaining the dynamic relationship between sources and target data behind the scenes.
97
97
 
98
- ### Internal storage
98
+ ## Internal storage
99
99
 
100
100
  As an indexing flow is long-lived, it needs to store intermediate data to keep track of the states.
101
101
  CocoIndex uses internal storage for this purpose.
102
102
 
103
103
  Currently, CocoIndex uses Postgres database as the internal storage.
104
- See [Initialization](initialization) for configuring its location, and `cocoindex setup` CLI command (see [CocoIndex CLI](cli)) creates tables for the internal storage.
105
-
106
- ## Retrieval
107
-
108
- There are two ways to retrieve data from target storage built by an indexing flow:
109
-
110
- * Query the underlying target storage directly for maximum flexibility.
111
- * Use CocoIndex *query handlers* for a more convenient experience with built-in tooling support (e.g. CocoInsight) to understand query performance against the target data.
112
-
113
- Query handlers are tied to specific indexing flows. They accept query inputs, transform them by defined operations, and retrieve matching data from the target storage that was created by the flow.
104
+ See [Initialization](initialization) for how to configure its location. The `cocoindex setup` CLI command (see [CocoIndex CLI](cli)) creates the tables for the internal storage.
@@ -35,6 +35,7 @@ This is the list of all basic types supported by CocoIndex:
35
35
  | Time | | `datetime.time` | `datetime.time` |
36
36
  | LocalDatetime | Date and time without timezone | `cocoindex.LocalDateTime` | `datetime.datetime` |
37
37
  | OffsetDatetime | Date and time with a timezone offset | `cocoindex.OffsetDateTime` | `datetime.datetime` |
38
+ | TimeDelta | A duration of time | `datetime.timedelta` | `datetime.timedelta` |
38
39
  | Vector[*T*, *Dim*?] | *T* must be basic type. *Dim* is a positive integer and optional. |`cocoindex.Vector[T]` or `cocoindex.Vector[T, Dim]` | `list[T]` |
39
40
  | Json | | `cocoindex.Json` | Any data convertible to JSON by `json` package |
40
41
 
@@ -1,7 +1,6 @@
1
1
  ---
2
2
  title: Flow Definition
3
3
  description: Define a CocoIndex flow, by specifying source, transformations and storages, and connect input/output data of them.
4
- toc_max_heading_level: 4
5
4
  ---
6
5
 
7
6
  import Tabs from '@theme/Tabs';
@@ -54,11 +54,7 @@ Create a new file `quickstart.py` and import the `cocoindex` library:
54
54
  import cocoindex
55
55
  ```
56
56
 
57
- Then we'll put the following pieces into the file:
58
-
59
- * Define an indexing flow, which specifies a data flow to transform data from specified data source into a vector index.
60
- * Define a query handler, which can be used to query the vector index.
61
- * A main function, to interact with users and run queries using the query handler above.
57
+ Then we'll create the indexing flow.
62
58
 
63
59
  ### Step 2.1: Define the indexing flow
64
60
 
@@ -121,46 +117,14 @@ Notes:
121
117
 
122
118
  6. In CocoIndex, a *collector* collects multiple entries of data together. In this example, the `doc_embeddings` collector collects data from all `chunk`s across all `doc`s, and uses the collected data to build a vector index `"doc_embeddings"`, using `Postgres`.
123
119
 
124
- ### Step 2.2: Define the query handler
125
-
126
- Starting from the query handler:
127
-
128
- ```python title="quickstart.py"
129
- query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
130
- name="SemanticsSearch",
131
- flow=text_embedding_flow,
132
- target_name="doc_embeddings",
133
- query_transform_flow=lambda text: text.transform(
134
- cocoindex.functions.SentenceTransformerEmbed(
135
- model="sentence-transformers/all-MiniLM-L6-v2")),
136
- default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
137
- ```
138
-
139
- This handler queries the vector index `"doc_embeddings"`, and uses the same embedding model `"sentence-transformers/all-MiniLM-L6-v2"` to transform query text into vectors for similarity matching.
140
-
141
-
142
- ### Step 2.3: Define the main function
120
+ ### Step 2.2: Define the main function
143
121
 
144
- The main function is used to interact with users and run queries using the query handler above.
122
+ We can provide an empty main function for now, with a `@cocoindex.main_fn()` decorator:
145
123
 
146
124
  ```python title="quickstart.py"
147
125
  @cocoindex.main_fn()
148
126
  def _main():
149
- # Run queries to demonstrate the query capabilities.
150
- while True:
151
- try:
152
- query = input("Enter search query (or Enter to quit): ")
153
- if query == '':
154
- break
155
- results, _ = query_handler.search(query, 10)
156
- print("\nSearch results:")
157
- for result in results:
158
- print(f"[{result.score:.3f}] {result.data['filename']}")
159
- print(f" {result.data['text']}")
160
- print("---")
161
- print()
162
- except KeyboardInterrupt:
163
- break
127
+ pass
164
128
 
165
129
  if __name__ == "__main__":
166
130
  _main()
@@ -168,10 +132,9 @@ if __name__ == "__main__":
168
132
 
169
133
  The `@cocoindex.main_fn` declares a function as the main function for an indexing application. This achieves the following effects:
170
134
 
171
- * Initialize the CocoIndex librart states. Settings (e.g. database URL) are loaded from environment variables by default.
135
+ * Initialize the CocoIndex library states. Settings (e.g. database URL) are loaded from environment variables by default.
172
136
  * When the CLI is invoked with the `cocoindex` subcommand, the `cocoindex` CLI takes over control, which provides convenient ways to manage the index. See the next step for more details.
173
137
 
174
-
175
138
  ## Step 3: Run the indexing pipeline and queries
176
139
 
177
140
  Specify the database URL by environment variable:
@@ -206,9 +169,129 @@ It will run for a few seconds and output the following statistics:
206
169
  documents: 3 added, 0 removed, 0 updated
207
170
  ```
208
171
 
209
- ### Step 3.3: Run queries against the index
172
+ ## Step 4 (optional): Run queries against the index
173
+
174
+ CocoIndex excels at transforming your data and storing it (a.k.a. indexing).
175
+ The goal of transforming your data is usually to query against it.
176
+ Once you already have your index built, you can directly access the transformed data in the target database.
177
+ CocoIndex also provides utilities for you to do this more seamlessly.
178
+
179
+ In this example, we'll use the [`psycopg` library](https://www.psycopg.org/) to connect to the database and run queries.
180
+ Please make sure it's installed:
181
+
182
+ ```bash
183
+ pip install psycopg[binary,pool]
184
+ ```
185
+
186
+ ### Step 4.1: Extract common transformations
187
+
188
+ Between your indexing flow and the query logic, one piece of transformation is shared: compute the embedding of a text.
189
+ i.e. they should use exactly the same embedding model and parameters.
190
+
191
+ Let's extract that into a function:
192
+
193
+ ```python title="quickstart.py"
194
+ @cocoindex.transform_flow()
195
+ def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
196
+ return text.transform(
197
+ cocoindex.functions.SentenceTransformerEmbed(
198
+ model="sentence-transformers/all-MiniLM-L6-v2"))
199
+ ```
200
+
201
+ `cocoindex.DataSlice[str]` represents certain data in the flow (e.g. a field in a data scope), with type `str` at runtime.
202
+ Similar to the `text_embedding_flow()` above, the `text_to_embedding()` function also constructs a flow instead of directly doing computation,
203
+ so the type it takes is `cocoindex.DataSlice[str]` instead of `str`.
204
+ See [Data Slice](../core/flow_def#data-slice) for more details.
205
+
206
+
207
+ Then the corresponding code in the indexing flow can be simplified by calling this function:
208
+
209
+ ```python title="quickstart.py"
210
+ ...
211
+ # Transform data of each chunk
212
+ with doc["chunks"].row() as chunk:
213
+ # Embed the chunk, put into `embedding` field
214
+ chunk["embedding"] = text_to_embedding(chunk["text"])
215
+
216
+ # Collect the chunk into the collector.
217
+ doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
218
+ text=chunk["text"], embedding=chunk["embedding"])
219
+ ...
220
+ ```
221
+
222
+ The function decorator `@cocoindex.transform_flow()` is used to declare a function as a CocoIndex transform flow,
223
+ i.e., a sub flow only performing transformations, without importing data from sources or exporting data to targets.
224
+ The decorator is needed for evaluating the flow with specific input data in Step 4.2 below.
225
+
226
+ ### Step 4.2: Provide the query logic
227
+
228
+ Now we can create a function to query the index upon a given input query:
229
+
230
+ ```python title="quickstart.py"
231
+ from psycopg_pool import ConnectionPool
232
+
233
+ def search(pool: ConnectionPool, query: str, top_k: int = 5):
234
+ # Get the table name, for the export target in the text_embedding_flow above.
235
+ table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, "doc_embeddings")
236
+ # Evaluate the transform flow defined above with the input query, to get the embedding.
237
+ query_vector = text_to_embedding.eval(query)
238
+ # Run the query and get the results.
239
+ with pool.connection() as conn:
240
+ with conn.cursor() as cur:
241
+ cur.execute(f"""
242
+ SELECT filename, text, embedding <=> %s::vector AS distance
243
+ FROM {table_name} ORDER BY distance LIMIT %s
244
+ """, (query_vector, top_k))
245
+ return [
246
+ {"filename": row[0], "text": row[1], "score": 1.0 - row[2]}
247
+ for row in cur.fetchall()
248
+ ]
249
+ ```
250
+
251
+ In the function above, most parts are standard query logic - you can use any libraries you like.
252
+ There are two pieces of CocoIndex-specific logic:
253
+
254
+ 1. Get the table name from the export target in the `text_embedding_flow` above.
255
+ Since the table name for the `Postgres` target is not explicitly specified in the `export()` call,
256
+ CocoIndex uses a default name.
257
+ `cocoindex.utils.get_target_storage_default_name()` is a utility function to get the default table name for this case.
258
+
259
+ 2. Evaluate the transform flow defined above with the input query, to get the embedding.
260
+ It's done by the `eval()` method of the transform flow `text_to_embedding`.
261
+ The return type of this method is `list[float]` as declared in the `text_to_embedding()` function (`cocoindex.DataSlice[list[float]]`).
262
+
263
+ ### Step 4.3: Update the main function
264
+
265
+ Now we can update the main function to use the query function we just defined:
266
+
267
+ ```python title="quickstart.py"
268
+ @cocoindex.main_fn()
269
+ def _main():
270
+ # Initialize the database connection pool.
271
+ pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
272
+ # Run queries in a loop to demonstrate the query capabilities.
273
+ while True:
274
+ try:
275
+ query = input("Enter search query (or Enter to quit): ")
276
+ if query == '':
277
+ break
278
+ # Run the query function with the database connection pool and the query.
279
+ results = search(pool, query)
280
+ print("\nSearch results:")
281
+ for result in results:
282
+ print(f"[{result['score']:.3f}] {result['filename']}")
283
+ print(f" {result['text']}")
284
+ print("---")
285
+ print()
286
+ except KeyboardInterrupt:
287
+ break
288
+ ```
289
+
290
+ It interacts with users and searches the database by calling the `search()` function created in Step 4.2.
291
+
292
+ ### Step 4.4: Run queries against the index
210
293
 
211
- Now we have the index built. We can run the same Python file without additional arguments, which will run the main function defined in Step 2.3:
294
+ Now we can run the same Python file, which will run the new main function:
212
295
 
213
296
  ```bash
214
297
  python quickstart.py
@@ -222,5 +305,5 @@ Next, you may want to:
222
305
 
223
306
  * Learn about [CocoIndex Basics](../core/basics.md).
224
307
  * Learn about other examples in the [examples](https://github.com/cocoindex-io/cocoindex/tree/main/examples) directory.
225
- * The `text_embedding` example is this quickstart with some polishing (loading environment variables from `.env` file, extract pieces shared by the indexing flow and query handler into a function).
308
+ * The `text_embedding` example is this quickstart.
226
309
  * Pick other examples to learn upon your interest.
@@ -0,0 +1,102 @@
1
+ ---
2
+ title: Query Support
3
+ description: CocoIndex supports vector search and text search.
4
+ ---
5
+
6
+ import Tabs from '@theme/Tabs';
7
+ import TabItem from '@theme/TabItem';
8
+
9
+ # CocoIndex Query Support
10
+
11
+ The main functionality of CocoIndex is indexing.
12
+ The goal of indexing is to enable efficient querying against your data.
13
+ You can use any libraries or frameworks of your choice to perform queries.
14
+ At the same time, CocoIndex provides seamless integration between indexing and querying workflows.
15
+ For example, you can share transformations between indexing and querying, and easily retrieve table names when using CocoIndex's default naming conventions.
16
+
17
+ ## Transform Flow
18
+
19
+ Sometimes a part of the transformation logic needs to be shared between indexing and querying,
20
+ e.g. when we build a vector index and query against it, the embedding computation needs to be consistent between indexing and querying.
21
+
22
+ In this case, you can:
23
+
24
+ 1. Extract a sub-flow with the shared transformation logic into a standalone function.
25
+ * It takes one or more data slices as input.
26
+ * It returns one data slice as output.
27
+ * You need to annotate data types for both inputs and outputs as the type parameter for `cocoindex.DataSlice[T]`. See [data types](./core/data_types.mdx) for more details about supported data types.
28
+
29
+ 2. When you're defining your indexing flow, you can directly call the function.
30
+ The body will be executed, so that the transformation logic will be added as part of the indexing flow.
31
+
32
+ 3. At query time, you usually want to directly run the function with specific input data, instead of having it called as part of a long-lived indexing flow.
33
+ To do this, declare the function as a *transform flow*, by decorating it with `@cocoindex.transform_flow()`.
34
+ This will add `eval()` and `eval_async()` methods to the function, so that you can directly call with specific input data.
35
+
36
+
37
+ <Tabs>
38
+ <TabItem value="python" label="Python">
39
+
40
+ The [quickstart](getting_started/quickstart#step-41-extract-common-transformations) shows an example:
41
+
42
+ ```python
43
+ @cocoindex.transform_flow()
44
+ def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
45
+ return text.transform(
46
+ cocoindex.functions.SentenceTransformerEmbed(
47
+ model="sentence-transformers/all-MiniLM-L6-v2"))
48
+ ```
49
+
50
+ When you're defining your indexing flow, you can directly call the function:
51
+
52
+ ```python
53
+ with doc["chunks"].row() as chunk:
54
+ chunk["embedding"] = text_to_embedding(chunk["text"])
55
+ ```
56
+
57
+ or, using the `call()` method of the transform flow on the first argument, to make operations chainable:
58
+
59
+ ```python
60
+ with doc["chunks"].row() as chunk:
61
+ chunk["embedding"] = chunk["text"].call(text_to_embedding)
62
+ ```
63
+
64
+ At any time, you can call the `eval()` method with a specific string, which will return a `list[float]`:
65
+
66
+ ```python
67
+ print(text_to_embedding.eval("Hello, world!"))
68
+ ```
69
+
70
+ If you're in an async context, please call the `eval_async()` method instead:
71
+
72
+ ```python
73
+ print(await text_to_embedding.eval_async("Hello, world!"))
74
+ ```
75
+
76
+ </TabItem>
77
+ </Tabs>
78
+
79
+ ## Get Target Native Names
80
+
81
+ In your indexing flow, when you export data to a target, you can specify the target name (e.g. a database table name, a collection name, the node label in property graph databases, etc.) explicitly,
82
+ or for some backends you can also omit it and let CocoIndex generate a default name for you.
83
+ For the latter case, CocoIndex provides a utility function `cocoindex.utils.get_target_storage_default_name()` to get the default name.
84
+ It takes the following arguments:
85
+
86
+ * `flow` (type: `cocoindex.Flow`): The flow to get the default name for.
87
+ * `target_name` (type: `str`): The export target name, as it appears in the `export()` call.
88
+
89
+ For example:
90
+
91
+ <Tabs>
92
+ <TabItem value="python" label="Python">
93
+
94
+ ```python
95
+ table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, "doc_embeddings")
96
+ query = f"SELECT filename, text FROM {table_name} ORDER BY embedding <=> %s::vector LIMIT 5"
97
+ ...
98
+ ```
99
+
100
+ </TabItem>
101
+ </Tabs>
102
+
@@ -44,6 +44,11 @@ const sidebars: SidebarsConfig = {
44
44
  'ai/llm',
45
45
  ],
46
46
  },
47
+ {
48
+ type: 'doc',
49
+ id: 'query',
50
+ label: 'Query Support',
51
+ },
47
52
  {
48
53
  type: 'category',
49
54
  label: 'About',
@@ -0,0 +1,71 @@
1
+ # Build real-time index for codebase
2
+ [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
3
+
4
+ CocoIndex provides built-in support for codebase chunking, using Tree-sitter to preserve syntax boundaries. In this example, we will build a real-time index for a codebase using CocoIndex.
5
+
6
+ We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful.
7
+
8
+ ![Build embedding index for codebase](https://github.com/user-attachments/assets/6dc5ce89-c949-41d4-852f-ad95af163dbd)
9
+
10
+ [Tree-sitter](https://en.wikipedia.org/wiki/Tree-sitter_%28parser_generator%29) is a parser generator tool and an incremental parsing library. It is available in Rust 🦀 - [GitHub](https://github.com/tree-sitter/tree-sitter). CocoIndex has built-in Rust integration with Tree-sitter to efficiently parse code and extract syntax trees for various programming languages. Check out the list of supported languages [here](https://cocoindex.io/docs/ops/functions#splitrecursively) - in the `language` section.
11
+
12
+
13
+ ## Tutorials
14
+ - Step by step tutorial - Check out the [blog](https://cocoindex.io/blogs/index-code-base-for-rag).
15
+ - Video tutorial - [Youtube](https://youtu.be/G3WstvhHO24?si=Bnxu67Ax5Lv8b-J2).
16
+
17
+ ## Steps
18
+
19
+ ### Indexing Flow
20
+ <p align='center'>
21
+ <img width="434" alt="Screenshot 2025-05-19 at 10 14 36 PM" src="https://github.com/user-attachments/assets/3a506034-698f-480a-b653-22184dae4e14" />
22
+ </p>
23
+
24
+ 1. We will ingest the CocoIndex codebase.
25
+ 2. For each file, perform chunking (Tree-sitter) and then embedding.
26
+ 3. We will save the embeddings and the metadata in Postgres with PGVector.
27
+
28
+ ### Query:
29
+ We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow.
30
+
31
+
32
+ ## Prerequisite
33
+ [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
34
+
35
+ ## Run
36
+
37
+ - Install dependencies:
38
+ ```bash
39
+ pip install -e .
40
+ ```
41
+
42
+ - Setup:
43
+
44
+ ```bash
45
+ python main.py cocoindex setup
46
+ ```
47
+
48
+ - Update index:
49
+
50
+ ```bash
51
+ python main.py cocoindex update
52
+ ```
53
+
54
+ - Run:
55
+
56
+ ```bash
57
+ python main.py
58
+ ```
59
+
60
+ ## CocoInsight
61
+ I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
62
+ It just connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight:
63
+
64
+ ```
65
+ python main.py cocoindex server -ci
66
+ ```
67
+
68
+ Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
69
+
70
+ <img width="1305" alt="Chunking Visualization" src="https://github.com/user-attachments/assets/8e83b9a4-2bed-456b-83e5-b5381b28b84a" />
71
+
@@ -1,5 +1,5 @@
1
1
  from dotenv import load_dotenv
2
-
2
+ from psycopg_pool import ConnectionPool
3
3
  import cocoindex
4
4
  import os
5
5
 
@@ -8,7 +8,8 @@ def extract_extension(filename: str) -> str:
8
8
  """Extract the extension of a filename."""
9
9
  return os.path.splitext(filename)[1]
10
10
 
11
- def code_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice:
11
+ @cocoindex.transform_flow()
12
+ def code_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
12
13
  """
13
14
  Embed the text using a SentenceTransformer model.
14
15
  """
@@ -24,7 +25,7 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
24
25
  data_scope["files"] = flow_builder.add_source(
25
26
  cocoindex.sources.LocalFile(path="../..",
26
27
  included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
27
- excluded_patterns=[".*", "target", "**/node_modules"]))
28
+ excluded_patterns=["**/.*", "target", "**/node_modules"]))
28
29
  code_embeddings = data_scope.add_collector()
29
30
 
30
31
  with data_scope["files"].row() as file:
@@ -47,26 +48,40 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
47
48
  metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
48
49
 
49
50
 
50
- query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
51
- name="SemanticsSearch",
52
- flow=code_embedding_flow,
53
- target_name="code_embeddings",
54
- query_transform_flow=code_to_embedding,
55
- default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
51
+
52
+ def search(pool: ConnectionPool, query: str, top_k: int = 5):
53
+ # Get the table name, for the export target in the code_embedding_flow above.
54
+ table_name = cocoindex.utils.get_target_storage_default_name(code_embedding_flow, "code_embeddings")
55
+ # Evaluate the transform flow defined above with the input query, to get the embedding.
56
+ query_vector = code_to_embedding.eval(query)
57
+ # Run the query and get the results.
58
+ with pool.connection() as conn:
59
+ with conn.cursor() as cur:
60
+ cur.execute(f"""
61
+ SELECT filename, code, embedding <=> %s::vector AS distance
62
+ FROM {table_name} ORDER BY distance LIMIT %s
63
+ """, (query_vector, top_k))
64
+ return [
65
+ {"filename": row[0], "code": row[1], "score": 1.0 - row[2]}
66
+ for row in cur.fetchall()
67
+ ]
56
68
 
57
69
  @cocoindex.main_fn()
58
70
  def _run():
71
+ # Initialize the database connection pool.
72
+ pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
59
73
  # Run queries in a loop to demonstrate the query capabilities.
60
74
  while True:
61
75
  try:
62
76
  query = input("Enter search query (or Enter to quit): ")
63
77
  if query == '':
64
78
  break
65
- results, _ = query_handler.search(query, 10)
79
+ # Run the query function with the database connection pool and the query.
80
+ results = search(pool, query)
66
81
  print("\nSearch results:")
67
82
  for result in results:
68
- print(f"[{result.score:.3f}] {result.data['filename']}")
69
- print(f" {result.data['code']}")
83
+ print(f"[{result['score']:.3f}] {result['filename']}")
84
+ print(f" {result['code']}")
70
85
  print("---")
71
86
  print()
72
87
  except KeyboardInterrupt:
@@ -94,7 +94,7 @@ app.mount("/img", StaticFiles(directory="img"), name="img")
94
94
  # --- CocoIndex initialization on startup ---
95
95
  @app.on_event("startup")
96
96
  def startup_event():
97
- settings = cocoindex.setting.Settings.from_env()
97
+ settings = cocoindex.Settings.from_env()
98
98
  cocoindex.init(settings)
99
99
  app.state.query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
100
100
  name="ImageObjectSearch",
@@ -61,7 +61,7 @@ class ProductTaxonomyInfo:
61
61
  complementary_taxonomies: list[ProductTaxonomy]
62
62
 
63
63
  @cocoindex.op.function(behavior_version=2)
64
- def extract_product_info(product: cocoindex.typing.Json, filename: str) -> ProductInfo:
64
+ def extract_product_info(product: cocoindex.Json, filename: str) -> ProductInfo:
65
65
  # Print markdown for LLM to extract the taxonomy and complimentary taxonomy
66
66
  return ProductInfo(
67
67
  id=f"{filename.removesuffix('.json')}",