cocoindex 0.1.40__tar.gz → 0.1.42__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cocoindex-0.1.40 → cocoindex-0.1.42}/Cargo.lock +1 -1
- {cocoindex-0.1.40 → cocoindex-0.1.42}/Cargo.toml +1 -1
- {cocoindex-0.1.40 → cocoindex-0.1.42}/PKG-INFO +2 -1
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/basics.md +10 -19
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/data_types.mdx +1 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/flow_def.mdx +0 -1
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/getting_started/quickstart.md +128 -45
- cocoindex-0.1.42/docs/docs/query.mdx +102 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/sidebars.ts +5 -0
- cocoindex-0.1.42/examples/code_embedding/README.md +71 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/code_embedding/main.py +27 -12
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/main.py +1 -1
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/main.py +1 -1
- cocoindex-0.1.42/examples/text_embedding/README.md +63 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/main.py +26 -11
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/pyproject.toml +5 -1
- cocoindex-0.1.42/examples/text_embedding_qdrant/README.md +87 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding_qdrant/main.py +28 -19
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding_qdrant/pyproject.toml +1 -1
- {cocoindex-0.1.40 → cocoindex-0.1.42}/pyproject.toml +4 -1
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/__init__.py +5 -4
- cocoindex-0.1.42/python/cocoindex/cli.py +437 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/flow.py +14 -7
- cocoindex-0.1.42/python/cocoindex/lib.py +71 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/typing.py +2 -0
- cocoindex-0.1.42/src/base/duration.rs +674 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/base/json_schema.rs +11 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/base/mod.rs +1 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/base/schema.rs +4 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/base/value.rs +16 -1
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/query.rs +2 -1
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/storages/neo4j.rs +14 -4
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/storages/postgres.rs +12 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/storages/qdrant.rs +9 -2
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/py/convert.rs +6 -2
- cocoindex-0.1.40/examples/code_embedding/README.md +0 -52
- cocoindex-0.1.40/examples/text_embedding/README.md +0 -46
- cocoindex-0.1.40/examples/text_embedding_qdrant/README.md +0 -69
- cocoindex-0.1.40/python/cocoindex/cli.py +0 -238
- cocoindex-0.1.40/python/cocoindex/lib.py +0 -78
- {cocoindex-0.1.40 → cocoindex-0.1.42}/.cargo/config.toml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/.env.lib_debug +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/ISSUE_TEMPLATE//360/237/220/233-bug-report.md" +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/ISSUE_TEMPLATE//360/237/222/241-feature-request.md" +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/scripts/update_version.sh +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/workflows/CI.yml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/workflows/_test.yml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/workflows/docs.yml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/.github/workflows/release.yml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/.gitignore +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/.vscode/settings.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/CODE_OF_CONDUCT.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/CONTRIBUTING.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/LICENSE +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/README.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/dev/neo4j.yaml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/dev/postgres.yaml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/.gitignore +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/README.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/about/community.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/about/contributing.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/ai/llm.mdx +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/cli.mdx +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/custom_function.mdx +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/data_example.svg +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/flow_example.svg +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/flow_methods.mdx +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/core/initialization.mdx +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/getting_started/installation.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/getting_started/markdown_files.zip +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/getting_started/overview.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/ops/functions.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/ops/sources.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docs/ops/storages.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/docusaurus.config.ts +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/package.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/src/components/HomepageFeatures/index.tsx +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/src/components/HomepageFeatures/styles.module.css +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/src/css/custom.css +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/src/theme/Root.js +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/static/.nojekyll +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/static/img/docusaurus.png +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/static/img/favicon.ico +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/static/img/icon.svg +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/static/robots.txt +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/tsconfig.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/docs/yarn.lock +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/amazon_s3_embedding/.env.example +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/amazon_s3_embedding/.gitignore +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/amazon_s3_embedding/README.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/amazon_s3_embedding/main.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/amazon_s3_embedding/pyproject.toml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/code_embedding/.env +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/code_embedding/pyproject.toml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/docs_to_knowledge_graph/.env +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/docs_to_knowledge_graph/README.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/docs_to_knowledge_graph/main.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/docs_to_knowledge_graph/pyproject.toml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/.dockerignore +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/.env +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/README.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/compose.yaml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/dockerfile +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/main.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/requirements.txt +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/sample_code/main.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/fastapi_server_docker/src/cocoindex_funs.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/gdrive_text_embedding/.env.example +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/gdrive_text_embedding/.gitignore +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/gdrive_text_embedding/README.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/gdrive_text_embedding/main.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/gdrive_text_embedding/pyproject.toml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/.env +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/README.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/.gitignore +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/index.html +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/package-lock.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/package.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/src/App.jsx +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/src/main.jsx +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/src/style.css +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/frontend/vite.config.js +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/img/cat1.jpeg +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/img/dog1.jpeg +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/img/elephant1.jpg +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/img/giraffe.jpg +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/image_search_example/requirements.txt +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/.env +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/README.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/main.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/manuals/array.pdf +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/manuals/base64.pdf +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/manuals/copy.pdf +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/manuals/glob.pdf +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/manuals_llm_extraction/pyproject.toml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/.env +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/README.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/main.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/pdf_files/1706.03762v7.pdf +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/pdf_files/1810.04805v2.pdf +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/pdf_files/rfc8259.pdf +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/pdf_embedding/pyproject.toml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/.env +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/README.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/img/cocoinsight.png +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/img/neo4j.png +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p1.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p2.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p3.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p4.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p5.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p6.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p7.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p8.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/products/p9.json +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/product_recommendation/pyproject.toml +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/.env +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/Text_Embedding.ipynb +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/markdown_files/1706.03762v7.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/markdown_files/1810.04805v2.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding/markdown_files/rfc8259.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding_qdrant/.env +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/examples/text_embedding_qdrant/markdown_files/rfc8259.md +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/auth_registry.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/convert.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/functions.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/index.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/llm.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/op.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/py.typed +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/query.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/runtime.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/setting.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/setup.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/sources.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/storages.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/tests/__init__.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/tests/test_convert.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/python/cocoindex/utils.py +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/base/field_attrs.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/base/spec.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/builder/analyzed_flow.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/builder/analyzer.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/builder/flow_builder.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/builder/mod.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/builder/plan.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/db_tracking.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/db_tracking_setup.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/dumper.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/evaluator.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/indexing_status.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/live_updater.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/memoization.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/mod.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/row_indexer.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/source_indexer.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/execution/stats.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/lib.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/lib_context.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/llm/anthropic.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/llm/gemini.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/llm/mod.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/llm/ollama.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/llm/openai.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/factory_bases.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/functions/extract_by_llm.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/functions/mod.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/functions/parse_json.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/functions/split_recursively.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/interface.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/mod.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/py_factory.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/registration.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/registry.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/sdk.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/sources/amazon_s3.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/sources/google_drive.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/sources/local_file.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/sources/mod.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/storages/mod.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/ops/storages/spec.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/prelude.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/py/mod.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/server.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/service/error.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/service/flows.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/service/mod.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/service/search.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/settings.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/setup/auth_registry.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/setup/components.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/setup/db_metadata.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/setup/driver.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/setup/mod.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/setup/states.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/utils/db.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/utils/fingerprint.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/utils/immutable.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/utils/mod.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/utils/retryable.rs +0 -0
- {cocoindex-0.1.40 → cocoindex-0.1.42}/src/utils/yaml_ser.rs +0 -0
@@ -1,9 +1,10 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cocoindex
|
3
|
-
Version: 0.1.40
|
3
|
+
Version: 0.1.42
|
4
4
|
Requires-Dist: sentence-transformers>=3.3.1
|
5
5
|
Requires-Dist: click>=8.1.8
|
6
6
|
Requires-Dist: rich>=14.0.0
|
7
|
+
Requires-Dist: python-dotenv>=1.1.0
|
7
8
|
Requires-Dist: pytest ; extra == 'test'
|
8
9
|
Provides-Extra: test
|
9
10
|
License-File: LICENSE
|
@@ -1,17 +1,17 @@
|
|
1
1
|
---
|
2
|
-
title: Basics
|
3
|
-
description: "CocoIndex basic concepts: indexing flow, data, operations, data updates, etc."
|
2
|
+
title: Indexing Basics
|
3
|
+
description: "CocoIndex basic concepts for indexing: indexing flow, data, operations, data updates, etc."
|
4
4
|
---
|
5
5
|
|
6
|
-
# CocoIndex Basics
|
6
|
+
# CocoIndex Indexing Basics
|
7
7
|
|
8
8
|
An **index** is a collection of data stored in a way that is easy for retrieval.
|
9
9
|
|
10
|
-
CocoIndex is an ETL framework for building indexes from specified data sources, a.k.a. indexing
|
10
|
+
CocoIndex is an ETL framework for building indexes from specified data sources, a.k.a. **indexing**. It also offers utilities for users to retrieve data from the indexes.
|
11
11
|
|
12
|
-
|
12
|
+
An **indexing flow** extracts data from specified data sources, applies specified transformations, and puts the transformed data into specified storage for later retrieval.
|
13
13
|
|
14
|
-
|
14
|
+
## Indexing flow elements
|
15
15
|
|
16
16
|
An indexing flow has two aspects: data and operations on data.
|
17
17
|
|
@@ -42,7 +42,7 @@ An **operation** in an indexing flow defines a step in the flow. An operation is
|
|
42
42
|
|
43
43
|
"import" and "transform" operations produce output data, whose data type is determined based on the operation spec and data types of input data (for "transform" operation only).
|
44
44
|
|
45
|
-
|
45
|
+
## An indexing flow example
|
46
46
|
|
47
47
|
For the example shown in the [Quickstart](../getting_started/quickstart) section, the indexing flow is as follows:
|
48
48
|
|
@@ -60,7 +60,7 @@ This shows schema and example data for the indexing flow:
|
|
60
60
|
|
61
61
|

|
62
62
|
|
63
|
-
|
63
|
+
## Life cycle of an indexing flow
|
64
64
|
|
65
65
|
An indexing flow, once set up, maintains a long-lived relationship between data source and data in target storage. This means:
|
66
66
|
|
@@ -95,19 +95,10 @@ CocoIndex works the same way, but with more powerful capabilities:
|
|
95
95
|
|
96
96
|
This means when writing your flow operations, you can treat source data as if it were static - focusing purely on defining the transformation logic. CocoIndex takes care of maintaining the dynamic relationship between sources and target data behind the scenes.
|
97
97
|
|
98
|
-
|
98
|
+
## Internal storage
|
99
99
|
|
100
100
|
As an indexing flow is long-lived, it needs to store intermediate data to keep track of the states.
|
101
101
|
CocoIndex uses internal storage for this purpose.
|
102
102
|
|
103
103
|
Currently, CocoIndex uses Postgres database as the internal storage.
|
104
|
-
See [Initialization](initialization) for configuring its location, and `cocoindex setup` CLI command (see [CocoIndex CLI](cli)) creates tables for the internal storage.
|
105
|
-
|
106
|
-
## Retrieval
|
107
|
-
|
108
|
-
There are two ways to retrieve data from target storage built by an indexing flow:
|
109
|
-
|
110
|
-
* Query the underlying target storage directly for maximum flexibility.
|
111
|
-
* Use CocoIndex *query handlers* for a more convenient experience with built-in tooling support (e.g. CocoInsight) to understand query performance against the target data.
|
112
|
-
|
113
|
-
Query handlers are tied to specific indexing flows. They accept query inputs, transform them by defined operations, and retrieve matching data from the target storage that was created by the flow.
|
104
|
+
See [Initialization](initialization) for configuring its location. The `cocoindex setup` CLI command (see [CocoIndex CLI](cli)) creates tables for the internal storage.
|
@@ -35,6 +35,7 @@ This is the list of all basic types supported by CocoIndex:
|
|
35
35
|
| Time | | `datetime.time` | `datetime.time` |
|
36
36
|
| LocalDatetime | Date and time without timezone | `cocoindex.LocalDateTime` | `datetime.datetime` |
|
37
37
|
| OffsetDatetime | Date and time with a timezone offset | `cocoindex.OffsetDateTime` | `datetime.datetime` |
|
38
|
+
| TimeDelta | A duration of time | `datetime.timedelta` | `datetime.timedelta` |
|
38
39
|
| Vector[*T*, *Dim*?] | *T* must be basic type. *Dim* is a positive integer and optional. |`cocoindex.Vector[T]` or `cocoindex.Vector[T, Dim]` | `list[T]` |
|
39
40
|
| Json | | `cocoindex.Json` | Any data convertible to JSON by `json` package |
|
40
41
|
|
@@ -54,11 +54,7 @@ Create a new file `quickstart.py` and import the `cocoindex` library:
|
|
54
54
|
import cocoindex
|
55
55
|
```
|
56
56
|
|
57
|
-
Then we'll
|
58
|
-
|
59
|
-
* Define an indexing flow, which specifies a data flow to transform data from specified data source into a vector index.
|
60
|
-
* Define a query handler, which can be used to query the vector index.
|
61
|
-
* A main function, to interact with users and run queries using the query handler above.
|
57
|
+
Then we'll create the indexing flow.
|
62
58
|
|
63
59
|
### Step 2.1: Define the indexing flow
|
64
60
|
|
@@ -121,46 +117,14 @@ Notes:
|
|
121
117
|
|
122
118
|
6. In CocoIndex, a *collector* collects multiple entries of data together. In this example, the `doc_embeddings` collector collects data from all `chunk`s across all `doc`s, and uses the collected data to build a vector index `"doc_embeddings"`, using `Postgres`.
|
123
119
|
|
124
|
-
### Step 2.2: Define the
|
125
|
-
|
126
|
-
Starting from the query handler:
|
127
|
-
|
128
|
-
```python title="quickstart.py"
|
129
|
-
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
|
130
|
-
name="SemanticsSearch",
|
131
|
-
flow=text_embedding_flow,
|
132
|
-
target_name="doc_embeddings",
|
133
|
-
query_transform_flow=lambda text: text.transform(
|
134
|
-
cocoindex.functions.SentenceTransformerEmbed(
|
135
|
-
model="sentence-transformers/all-MiniLM-L6-v2")),
|
136
|
-
default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
|
137
|
-
```
|
138
|
-
|
139
|
-
This handler queries the vector index `"doc_embeddings"`, and uses the same embedding model `"sentence-transformers/all-MiniLM-L6-v2"` to transform query text into vectors for similarity matching.
|
140
|
-
|
141
|
-
|
142
|
-
### Step 2.3: Define the main function
|
120
|
+
### Step 2.2: Define the main function
|
143
121
|
|
144
|
-
|
122
|
+
We can provide an empty main function for now, with a `@cocoindex.main_fn()` decorator:
|
145
123
|
|
146
124
|
```python title="quickstart.py"
|
147
125
|
@cocoindex.main_fn()
|
148
126
|
def _main():
|
149
|
-
|
150
|
-
while True:
|
151
|
-
try:
|
152
|
-
query = input("Enter search query (or Enter to quit): ")
|
153
|
-
if query == '':
|
154
|
-
break
|
155
|
-
results, _ = query_handler.search(query, 10)
|
156
|
-
print("\nSearch results:")
|
157
|
-
for result in results:
|
158
|
-
print(f"[{result.score:.3f}] {result.data['filename']}")
|
159
|
-
print(f" {result.data['text']}")
|
160
|
-
print("---")
|
161
|
-
print()
|
162
|
-
except KeyboardInterrupt:
|
163
|
-
break
|
127
|
+
pass
|
164
128
|
|
165
129
|
if __name__ == "__main__":
|
166
130
|
_main()
|
@@ -168,10 +132,9 @@ if __name__ == "__main__":
|
|
168
132
|
|
169
133
|
The `@cocoindex.main_fn` declares a function as the main function for an indexing application. This achieves the following effects:
|
170
134
|
|
171
|
-
* Initialize the CocoIndex
|
135
|
+
* Initialize the CocoIndex library states. Settings (e.g. database URL) are loaded from environment variables by default.
|
172
136
|
* When the CLI is invoked with `cocoindex` subcommand, `cocoindex CLI` takes over the control, which provides convenient ways to manage the index. See the next step for more details.
|
173
137
|
|
174
|
-
|
175
138
|
## Step 3: Run the indexing pipeline and queries
|
176
139
|
|
177
140
|
Specify the database URL by environment variable:
|
@@ -206,9 +169,129 @@ It will run for a few seconds and output the following statistics:
|
|
206
169
|
documents: 3 added, 0 removed, 0 updated
|
207
170
|
```
|
208
171
|
|
209
|
-
|
172
|
+
## Step 4 (optional): Run queries against the index
|
173
|
+
|
174
|
+
CocoIndex excels at transforming your data and storing it (a.k.a. indexing).
|
175
|
+
The goal of transforming your data is usually to query against it.
|
176
|
+
Once you have your index built, you can directly access the transformed data in the target database.
|
177
|
+
CocoIndex also provides utilities for you to do this more seamlessly.
|
178
|
+
|
179
|
+
In this example, we'll use the [`psycopg` library](https://www.psycopg.org/) to connect to the database and run queries.
|
180
|
+
Please make sure it's installed:
|
181
|
+
|
182
|
+
```bash
|
183
|
+
pip install psycopg[binary,pool]
|
184
|
+
```
|
185
|
+
|
186
|
+
### Step 4.1: Extract common transformations
|
187
|
+
|
188
|
+
Between your indexing flow and the query logic, one piece of transformation is shared: compute the embedding of a text.
|
189
|
+
i.e. they should use exactly the same embedding model and parameters.
|
190
|
+
|
191
|
+
Let's extract that into a function:
|
192
|
+
|
193
|
+
```python title="quickstart.py"
|
194
|
+
@cocoindex.transform_flow()
|
195
|
+
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
|
196
|
+
return text.transform(
|
197
|
+
cocoindex.functions.SentenceTransformerEmbed(
|
198
|
+
model="sentence-transformers/all-MiniLM-L6-v2"))
|
199
|
+
```
|
200
|
+
|
201
|
+
`cocoindex.DataSlice[str]` represents certain data in the flow (e.g. a field in a data scope), with type `str` at runtime.
|
202
|
+
Similar to the `text_embedding_flow()` above, the `text_to_embedding()` function is also for constructing the flow instead of directly doing computation,
|
203
|
+
so the type it takes is `cocoindex.DataSlice[str]` instead of `str`.
|
204
|
+
See [Data Slice](../core/flow_def#data-slice) for more details.
|
205
|
+
|
206
|
+
|
207
|
+
Then the corresponding code in the indexing flow can be simplified by calling this function:
|
208
|
+
|
209
|
+
```python title="quickstart.py"
|
210
|
+
...
|
211
|
+
# Transform data of each chunk
|
212
|
+
with doc["chunks"].row() as chunk:
|
213
|
+
# Embed the chunk, put into `embedding` field
|
214
|
+
chunk["embedding"] = text_to_embedding(chunk["text"])
|
215
|
+
|
216
|
+
# Collect the chunk into the collector.
|
217
|
+
doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
|
218
|
+
text=chunk["text"], embedding=chunk["embedding"])
|
219
|
+
...
|
220
|
+
```
|
221
|
+
|
222
|
+
The function decorator `@cocoindex.transform_flow()` is used to declare a function as a CocoIndex transform flow,
|
223
|
+
i.e., a sub flow only performing transformations, without importing data from sources or exporting data to targets.
|
224
|
+
The decorator is needed for evaluating the flow with specific input data in Step 4.2 below.
|
225
|
+
|
226
|
+
### Step 4.2: Provide the query logic
|
227
|
+
|
228
|
+
Now we can create a function to query the index upon a given input query:
|
229
|
+
|
230
|
+
```python title="quickstart.py"
|
231
|
+
from psycopg_pool import ConnectionPool
|
232
|
+
|
233
|
+
def search(pool: ConnectionPool, query: str, top_k: int = 5):
|
234
|
+
# Get the table name, for the export target in the text_embedding_flow above.
|
235
|
+
table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, "doc_embeddings")
|
236
|
+
# Evaluate the transform flow defined above with the input query, to get the embedding.
|
237
|
+
query_vector = text_to_embedding.eval(query)
|
238
|
+
# Run the query and get the results.
|
239
|
+
with pool.connection() as conn:
|
240
|
+
with conn.cursor() as cur:
|
241
|
+
cur.execute(f"""
|
242
|
+
SELECT filename, text, embedding <=> %s::vector AS distance
|
243
|
+
FROM {table_name} ORDER BY distance LIMIT %s
|
244
|
+
""", (query_vector, top_k))
|
245
|
+
return [
|
246
|
+
{"filename": row[0], "text": row[1], "score": 1.0 - row[2]}
|
247
|
+
for row in cur.fetchall()
|
248
|
+
]
|
249
|
+
```
|
250
|
+
|
251
|
+
In the function above, most parts are standard query logic - you can use any libraries you like.
|
252
|
+
There are two pieces of CocoIndex-specific logic:
|
253
|
+
|
254
|
+
1. Get the table name from the export target in the `text_embedding_flow` above.
|
255
|
+
Since the table name for the `Postgres` target is not explicitly specified in the `export()` call,
|
256
|
+
CocoIndex uses a default name.
|
257
|
+
`cocoindex.utils.get_target_storage_default_name()` is a utility function to get the default table name for this case.
|
258
|
+
|
259
|
+
2. Evaluate the transform flow defined above with the input query, to get the embedding.
|
260
|
+
It's done by the `eval()` method of the transform flow `text_to_embedding`.
|
261
|
+
The return type of this method is `list[float]` as declared in the `text_to_embedding()` function (`cocoindex.DataSlice[list[float]]`).
|
262
|
+
|
263
|
+
### Step 4.3: Update the main function
|
264
|
+
|
265
|
+
Now we can update the main function to use the query function we just defined:
|
266
|
+
|
267
|
+
```python title="quickstart.py"
|
268
|
+
@cocoindex.main_fn()
|
269
|
+
def _run():
|
270
|
+
# Initialize the database connection pool.
|
271
|
+
pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
|
272
|
+
# Run queries in a loop to demonstrate the query capabilities.
|
273
|
+
while True:
|
274
|
+
try:
|
275
|
+
query = input("Enter search query (or Enter to quit): ")
|
276
|
+
if query == '':
|
277
|
+
break
|
278
|
+
# Run the query function with the database connection pool and the query.
|
279
|
+
results = search(pool, query)
|
280
|
+
print("\nSearch results:")
|
281
|
+
for result in results:
|
282
|
+
print(f"[{result['score']:.3f}] {result['filename']}")
|
283
|
+
print(f" {result['text']}")
|
284
|
+
print("---")
|
285
|
+
print()
|
286
|
+
except KeyboardInterrupt:
|
287
|
+
break
|
288
|
+
```
|
289
|
+
|
290
|
+
It interacts with the user and searches the database by calling the `search()` function defined in Step 4.2.
|
291
|
+
|
292
|
+
### Step 4.4: Run queries against the index
|
210
293
|
|
211
|
-
Now we
|
294
|
+
Now we can run the same Python file, which will run the new main function:
|
212
295
|
|
213
296
|
```bash
|
214
297
|
python quickstart.py
|
@@ -222,5 +305,5 @@ Next, you may want to:
|
|
222
305
|
|
223
306
|
* Learn about [CocoIndex Basics](../core/basics.md).
|
224
307
|
* Learn about other examples in the [examples](https://github.com/cocoindex-io/cocoindex/tree/main/examples) directory.
|
225
|
-
* The `text_embedding` example is this quickstart
|
308
|
+
* The `text_embedding` example is this quickstart.
|
226
309
|
* Pick other examples to explore based on your interests.
|
@@ -0,0 +1,102 @@
|
|
1
|
+
---
|
2
|
+
title: Query Support
|
3
|
+
description: CocoIndex supports vector search and text search.
|
4
|
+
---
|
5
|
+
|
6
|
+
import Tabs from '@theme/Tabs';
|
7
|
+
import TabItem from '@theme/TabItem';
|
8
|
+
|
9
|
+
# CocoIndex Query Support
|
10
|
+
|
11
|
+
The main functionality of CocoIndex is indexing.
|
12
|
+
The goal of indexing is to enable efficient querying against your data.
|
13
|
+
You can use any libraries or frameworks of your choice to perform queries.
|
14
|
+
At the same time, CocoIndex provides seamless integration between indexing and querying workflows.
|
15
|
+
For example, you can share transformations between indexing and querying, and easily retrieve table names when using CocoIndex's default naming conventions.
|
16
|
+
|
17
|
+
## Transform Flow
|
18
|
+
|
19
|
+
Sometimes a part of the transformation logic needs to be shared between indexing and querying,
|
20
|
+
e.g. when we build a vector index and query against it, the embedding computation needs to be consistent between indexing and querying.
|
21
|
+
|
22
|
+
In this case, you can:
|
23
|
+
|
24
|
+
1. Extract a sub-flow with the shared transformation logic into a standalone function.
|
25
|
+
* It takes one or more data slices as input.
|
26
|
+
* It returns one data slice as output.
|
27
|
+
* You need to annotate the data types of both inputs and outputs as the type parameter of `cocoindex.DataSlice[T]`. See [data types](./core/data_types.mdx) for more details about supported data types.
|
28
|
+
|
29
|
+
2. When you're defining your indexing flow, you can directly call the function.
|
30
|
+
The body will be executed, so that the transformation logic will be added as part of the indexing flow.
|
31
|
+
|
32
|
+
3. At query time, you usually want to run the function directly with specific input data, instead of having it called as part of a long-lived indexing flow.
|
33
|
+
To do this, declare the function as a *transform flow*, by decorating it with `@cocoindex.transform_flow()`.
|
34
|
+
This will add `eval()` and `eval_async()` methods to the function, so that you can call it directly with specific input data.
|
35
|
+
|
36
|
+
|
37
|
+
<Tabs>
|
38
|
+
<TabItem value="python" label="Python">
|
39
|
+
|
40
|
+
The [quickstart](getting_started/quickstart#step-41-extract-common-transformations) shows an example:
|
41
|
+
|
42
|
+
```python
|
43
|
+
@cocoindex.transform_flow()
|
44
|
+
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
|
45
|
+
return text.transform(
|
46
|
+
cocoindex.functions.SentenceTransformerEmbed(
|
47
|
+
model="sentence-transformers/all-MiniLM-L6-v2"))
|
48
|
+
```
|
49
|
+
|
50
|
+
When you're defining your indexing flow, you can directly call the function:
|
51
|
+
|
52
|
+
```python
|
53
|
+
with doc["chunks"].row() as chunk:
|
54
|
+
chunk["embedding"] = text_to_embedding(chunk["text"])
|
55
|
+
```
|
56
|
+
|
57
|
+
or, using the `call()` method of the transform flow on the first argument, to make operations chainable:
|
58
|
+
|
59
|
+
```python
|
60
|
+
with doc["chunks"].row() as chunk:
|
61
|
+
chunk["embedding"] = chunk["text"].call(text_to_embedding)
|
62
|
+
```
|
63
|
+
|
64
|
+
At any time, you can call the `eval()` method with a specific string, which will return a `list[float]`:
|
65
|
+
|
66
|
+
```python
|
67
|
+
print(text_to_embedding.eval("Hello, world!"))
|
68
|
+
```
|
69
|
+
|
70
|
+
If you're in an async context, please call the `eval_async()` method instead:
|
71
|
+
|
72
|
+
```python
|
73
|
+
print(await text_to_embedding.eval_async("Hello, world!"))
|
74
|
+
```
|
75
|
+
|
76
|
+
</TabItem>
|
77
|
+
</Tabs>
|
78
|
+
|
79
|
+
## Get Target Native Names
|
80
|
+
|
81
|
+
In your indexing flow, when you export data to a target, you can specify the target name (e.g. a database table name, a collection name, the node label in property graph databases, etc.) explicitly,
|
82
|
+
or for some backends you can also omit it and let CocoIndex generate a default name for you.
|
83
|
+
For the latter case, CocoIndex provides a utility function `cocoindex.utils.get_target_storage_default_name()` to get the default name.
|
84
|
+
It takes the following arguments:
|
85
|
+
|
86
|
+
* `flow` (type: `cocoindex.Flow`): The flow to get the default name for.
|
87
|
+
* `target_name` (type: `str`): The export target name, as it appears in the `export()` call.
|
88
|
+
|
89
|
+
For example:
|
90
|
+
|
91
|
+
<Tabs>
|
92
|
+
<TabItem value="python" label="Python">
|
93
|
+
|
94
|
+
```python
|
95
|
+
table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, "doc_embeddings")
|
96
|
+
query = f"SELECT filename, text FROM {table_name} ORDER BY embedding <=> %s::vector LIMIT 5"
|
97
|
+
...
|
98
|
+
```
|
99
|
+
|
100
|
+
</TabItem>
|
101
|
+
</Tabs>
|
102
|
+
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# Build real-time index for codebase
|
2
|
+
[](https://github.com/cocoindex-io/cocoindex)
|
3
|
+
|
4
|
+
CocoIndex provides built-in support for codebase chunking, using Tree-sitter to respect syntax boundaries. In this example, we will build a real-time index for a codebase using CocoIndex.
|
5
|
+
|
6
|
+
We would appreciate a star ⭐ on [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful.
|
7
|
+
|
8
|
+

|
9
|
+
|
10
|
+
[Tree-sitter](https://en.wikipedia.org/wiki/Tree-sitter_%28parser_generator%29) is a parser generator tool and an incremental parsing library. It is available in Rust 🦀 - [GitHub](https://github.com/tree-sitter/tree-sitter). CocoIndex has built-in Rust integration with Tree-sitter to efficiently parse code and extract syntax trees for various programming languages. Check out the list of supported languages [here](https://cocoindex.io/docs/ops/functions#splitrecursively) - in the `language` section.
|
11
|
+
|
12
|
+
|
13
|
+
## Tutorials
|
14
|
+
- Step by step tutorial - Check out the [blog](https://cocoindex.io/blogs/index-code-base-for-rag).
|
15
|
+
- Video tutorial - [Youtube](https://youtu.be/G3WstvhHO24?si=Bnxu67Ax5Lv8b-J2).
|
16
|
+
|
17
|
+
## Steps
|
18
|
+
|
19
|
+
### Indexing Flow
|
20
|
+
<p align='center'>
|
21
|
+
<img width="434" alt="Screenshot 2025-05-19 at 10 14 36 PM" src="https://github.com/user-attachments/assets/3a506034-698f-480a-b653-22184dae4e14" />
|
22
|
+
</p>
|
23
|
+
|
24
|
+
1. We will ingest the CocoIndex codebase.
|
25
|
+
2. For each file, perform chunking (Tree-sitter) and then embedding.
|
26
|
+
3. We will save the embeddings and the metadata in Postgres with PGVector.
|
27
|
+
|
28
|
+
### Query:
|
29
|
+
We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow.
|
30
|
+
|
31
|
+
|
32
|
+
## Prerequisite
|
33
|
+
[Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
|
34
|
+
|
35
|
+
## Run
|
36
|
+
|
37
|
+
- Install dependencies:
|
38
|
+
```bash
|
39
|
+
pip install -e .
|
40
|
+
```
|
41
|
+
|
42
|
+
- Setup:
|
43
|
+
|
44
|
+
```bash
|
45
|
+
python main.py cocoindex setup
|
46
|
+
```
|
47
|
+
|
48
|
+
- Update index:
|
49
|
+
|
50
|
+
```bash
|
51
|
+
python main.py cocoindex update
|
52
|
+
```
|
53
|
+
|
54
|
+
- Run:
|
55
|
+
|
56
|
+
```bash
|
57
|
+
python main.py
|
58
|
+
```
|
59
|
+
|
60
|
+
## CocoInsight
|
61
|
+
I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
|
62
|
+
It just connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight:
|
63
|
+
|
64
|
+
```
|
65
|
+
python main.py cocoindex server -ci
|
66
|
+
```
|
67
|
+
|
68
|
+
Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
|
69
|
+
|
70
|
+
<img width="1305" alt="Chunking Visualization" src="https://github.com/user-attachments/assets/8e83b9a4-2bed-456b-83e5-b5381b28b84a" />
|
71
|
+
|
@@ -1,5 +1,5 @@
|
|
1
1
|
from dotenv import load_dotenv
|
2
|
-
|
2
|
+
from psycopg_pool import ConnectionPool
|
3
3
|
import cocoindex
|
4
4
|
import os
|
5
5
|
|
@@ -8,7 +8,8 @@ def extract_extension(filename: str) -> str:
|
|
8
8
|
"""Extract the extension of a filename."""
|
9
9
|
return os.path.splitext(filename)[1]
|
10
10
|
|
11
|
-
|
11
|
+
@cocoindex.transform_flow()
|
12
|
+
def code_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
|
12
13
|
"""
|
13
14
|
Embed the text using a SentenceTransformer model.
|
14
15
|
"""
|
@@ -24,7 +25,7 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
|
|
24
25
|
data_scope["files"] = flow_builder.add_source(
|
25
26
|
cocoindex.sources.LocalFile(path="../..",
|
26
27
|
included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
|
27
|
-
excluded_patterns=["
|
28
|
+
excluded_patterns=["**/.*", "target", "**/node_modules"]))
|
28
29
|
code_embeddings = data_scope.add_collector()
|
29
30
|
|
30
31
|
with data_scope["files"].row() as file:
|
@@ -47,26 +48,40 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
|
|
47
48
|
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
|
48
49
|
|
49
50
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
51
|
+
|
52
|
+
def search(pool: ConnectionPool, query: str, top_k: int = 5):
|
53
|
+
# Get the table name, for the export target in the code_embedding_flow above.
|
54
|
+
table_name = cocoindex.utils.get_target_storage_default_name(code_embedding_flow, "code_embeddings")
|
55
|
+
# Evaluate the transform flow defined above with the input query, to get the embedding.
|
56
|
+
query_vector = code_to_embedding.eval(query)
|
57
|
+
# Run the query and get the results.
|
58
|
+
with pool.connection() as conn:
|
59
|
+
with conn.cursor() as cur:
|
60
|
+
cur.execute(f"""
|
61
|
+
SELECT filename, code, embedding <=> %s::vector AS distance
|
62
|
+
FROM {table_name} ORDER BY distance LIMIT %s
|
63
|
+
""", (query_vector, top_k))
|
64
|
+
return [
|
65
|
+
{"filename": row[0], "code": row[1], "score": 1.0 - row[2]}
|
66
|
+
for row in cur.fetchall()
|
67
|
+
]
|
56
68
|
|
57
69
|
@cocoindex.main_fn()
|
58
70
|
def _run():
|
71
|
+
# Initialize the database connection pool.
|
72
|
+
pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
|
59
73
|
# Run queries in a loop to demonstrate the query capabilities.
|
60
74
|
while True:
|
61
75
|
try:
|
62
76
|
query = input("Enter search query (or Enter to quit): ")
|
63
77
|
if query == '':
|
64
78
|
break
|
65
|
-
|
79
|
+
# Run the query function with the database connection pool and the query.
|
80
|
+
results = search(pool, query)
|
66
81
|
print("\nSearch results:")
|
67
82
|
for result in results:
|
68
|
-
print(f"[{result
|
69
|
-
print(f" {result
|
83
|
+
print(f"[{result['score']:.3f}] {result['filename']}")
|
84
|
+
print(f" {result['code']}")
|
70
85
|
print("---")
|
71
86
|
print()
|
72
87
|
except KeyboardInterrupt:
|
@@ -94,7 +94,7 @@ app.mount("/img", StaticFiles(directory="img"), name="img")
|
|
94
94
|
# --- CocoIndex initialization on startup ---
|
95
95
|
@app.on_event("startup")
|
96
96
|
def startup_event():
|
97
|
-
settings = cocoindex.
|
97
|
+
settings = cocoindex.Settings.from_env()
|
98
98
|
cocoindex.init(settings)
|
99
99
|
app.state.query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
|
100
100
|
name="ImageObjectSearch",
|
@@ -61,7 +61,7 @@ class ProductTaxonomyInfo:
|
|
61
61
|
complementary_taxonomies: list[ProductTaxonomy]
|
62
62
|
|
63
63
|
@cocoindex.op.function(behavior_version=2)
|
64
|
-
def extract_product_info(product: cocoindex.
|
64
|
+
def extract_product_info(product: cocoindex.Json, filename: str) -> ProductInfo:
|
65
65
|
# Print markdown for LLM to extract the taxonomy and complementary taxonomy
|
66
66
|
return ProductInfo(
|
67
67
|
id=f"{filename.removesuffix('.json')}",
|