cocoindex 0.1.76__tar.gz → 0.1.78__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cocoindex-0.1.76 → cocoindex-0.1.78}/Cargo.lock +1 -1
- {cocoindex-0.1.76 → cocoindex-0.1.78}/Cargo.toml +1 -1
- {cocoindex-0.1.76 → cocoindex-0.1.78}/PKG-INFO +2 -1
- {cocoindex-0.1.76 → cocoindex-0.1.78}/README.md +1 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/core/data_types.mdx +20 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/core/flow_def.mdx +28 -9
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/core/settings.mdx +22 -11
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/ops/functions.md +9 -9
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/pyproject.toml +1 -1
- cocoindex-0.1.78/examples/multi_format_indexing/README.md +71 -0
- cocoindex-0.1.78/examples/multi_format_indexing/main.py +135 -0
- cocoindex-0.1.78/examples/multi_format_indexing/pyproject.toml +14 -0
- cocoindex-0.1.78/examples/multi_format_indexing/source_files/cat1.jpeg +0 -0
- cocoindex-0.1.78/examples/multi_format_indexing/source_files/dog1.jpeg +0 -0
- cocoindex-0.1.78/examples/multi_format_indexing/source_files/elephant1.jpg +0 -0
- cocoindex-0.1.78/examples/multi_format_indexing/source_files/giraffe.jpg +0 -0
- cocoindex-0.1.78/examples/pdf_embedding/pdf_files/1706.03762v7.pdf +0 -0
- cocoindex-0.1.78/examples/pdf_embedding/pdf_files/1810.04805v2.pdf +0 -0
- cocoindex-0.1.78/examples/pdf_embedding/pdf_files/rfc8259.pdf +0 -0
- cocoindex-0.1.78/examples/text_embedding_qdrant/.env +2 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/op.py +65 -39
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/setting.py +16 -2
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/tests/test_transform_flow.py +41 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/execution/evaluator.rs +11 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/lib_context.rs +28 -2
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/settings.rs +17 -3
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.cargo/config.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.env.lib_debug +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.github/ISSUE_TEMPLATE/🐛-bug-report.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.github/ISSUE_TEMPLATE/💡-feature-request.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.github/scripts/update_version.sh +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.github/workflows/CI.yml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.github/workflows/_doc_release.yml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.github/workflows/_test.yml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.github/workflows/docs.yml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.github/workflows/format.yml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.github/workflows/release.yml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.gitignore +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/.pre-commit-config.yaml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/CODE_OF_CONDUCT.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/CONTRIBUTING.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/LICENSE +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/dev/neo4j.yaml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/dev/postgres.yaml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/.gitignore +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/about/community.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/about/contributing.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/ai/llm.mdx +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/core/basics.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/core/cli.mdx +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/core/data_example.svg +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/core/flow_example.svg +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/core/flow_methods.mdx +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/custom_ops/custom_functions.mdx +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/custom_ops/custom_targets.mdx +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/getting_started/installation.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/getting_started/markdown_files.zip +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/getting_started/overview.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/getting_started/quickstart.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/ops/sources.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/ops/targets.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/query.mdx +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/tutorials/live_updates.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docs/tutorials/manage_flow_dynamically.mdx +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/docusaurus.config.ts +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/package.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/sidebars.ts +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/src/components/HomepageFeatures/index.tsx +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/src/components/HomepageFeatures/styles.module.css +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/src/css/custom.css +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/src/theme/Root.js +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/static/.nojekyll +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/static/img/docusaurus.png +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/static/img/favicon.ico +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/static/img/icon.svg +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/static/img/incremental-etl.gif +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/static/robots.txt +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/tsconfig.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/docs/yarn.lock +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/amazon_s3_embedding/.env.example +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/amazon_s3_embedding/.gitignore +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/amazon_s3_embedding/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/amazon_s3_embedding/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/amazon_s3_embedding/pyproject.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/azure_blob_embedding/.env.example +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/azure_blob_embedding/.gitignore +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/azure_blob_embedding/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/azure_blob_embedding/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/azure_blob_embedding/pyproject.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/code_embedding/.env +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/code_embedding/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/code_embedding/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/code_embedding/pyproject.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/custom_output_files/.env +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/custom_output_files/.gitignore +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/custom_output_files/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/custom_output_files/data/bizarre_animals.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/custom_output_files/data/chunk_norris.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/custom_output_files/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/custom_output_files/pyproject.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/docs_to_knowledge_graph/.env +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/docs_to_knowledge_graph/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/docs_to_knowledge_graph/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/docs_to_knowledge_graph/pyproject.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/face_recognition/.env +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/face_recognition/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/face_recognition/images/Carter_welcomes_Reagan.jpg +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/face_recognition/images/Solvay_conference_1927.jpg +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/face_recognition/images/Steve_Jobs_and_Bill_Gates_(522695099).jpg +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/face_recognition/images/einplanck3.jpg +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/face_recognition/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/face_recognition/pyproject.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/fastapi_server_docker/.dockerignore +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/fastapi_server_docker/.env +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/fastapi_server_docker/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/fastapi_server_docker/compose.yaml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/fastapi_server_docker/dockerfile +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/fastapi_server_docker/files/1810.04805v2.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/fastapi_server_docker/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/fastapi_server_docker/requirements.txt +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/gdrive_text_embedding/.env.example +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/gdrive_text_embedding/.gitignore +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/gdrive_text_embedding/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/gdrive_text_embedding/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/gdrive_text_embedding/pyproject.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/.env +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/colpali_main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/frontend/.gitignore +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/frontend/index.html +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/frontend/package-lock.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/frontend/package.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/frontend/src/App.jsx +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/frontend/src/main.jsx +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/frontend/src/style.css +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/frontend/vite.config.js +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/img/cat1.jpeg +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/img/dog1.jpeg +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/img/elephant1.jpg +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/img/giraffe.jpg +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/image_search/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/live_updates/.env +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/live_updates/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/live_updates/data/bizarre_animals.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/live_updates/data/chunk_norris.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/live_updates/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/live_updates/pyproject.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/manuals_llm_extraction/.env +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/manuals_llm_extraction/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/manuals_llm_extraction/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/manuals_llm_extraction/manuals/array.pdf +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/manuals_llm_extraction/manuals/base64.pdf +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/manuals_llm_extraction/manuals/copy.pdf +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/manuals_llm_extraction/manuals/glob.pdf +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/manuals_llm_extraction/pyproject.toml +0 -0
- {cocoindex-0.1.76/examples/pdf_embedding → cocoindex-0.1.78/examples/multi_format_indexing}/.env +0 -0
- {cocoindex-0.1.76/examples/paper_metadata/papers → cocoindex-0.1.78/examples/multi_format_indexing/source_files}/1706.03762v7.pdf +0 -0
- {cocoindex-0.1.76/examples/paper_metadata/papers → cocoindex-0.1.78/examples/multi_format_indexing/source_files}/1810.04805v2.pdf +0 -0
- {cocoindex-0.1.76/examples/pdf_embedding/pdf_files → cocoindex-0.1.78/examples/multi_format_indexing/source_files}/rfc8259.pdf +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/paper_metadata/.env.example +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/paper_metadata/.gitignore +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/paper_metadata/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/paper_metadata/main.py +0 -0
- {cocoindex-0.1.76/examples/pdf_embedding/pdf_files → cocoindex-0.1.78/examples/paper_metadata/papers}/1706.03762v7.pdf +0 -0
- {cocoindex-0.1.76/examples/pdf_embedding/pdf_files → cocoindex-0.1.78/examples/paper_metadata/papers}/1810.04805v2.pdf +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/paper_metadata/papers/2502.06786v3.pdf +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/paper_metadata/papers/2502.20346v1.pdf +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/paper_metadata/pyproject.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/patient_intake_extraction/.env.example +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/patient_intake_extraction/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/patient_intake_extraction/data/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/patient_intake_extraction/data/patient_forms/Patient_Intake_Form_David_Artificial.docx +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/patient_intake_extraction/data/patient_forms/Patient_Intake_Form_Emily_Artificial.pdf +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/patient_intake_extraction/data/patient_forms/Patient_Intake_Form_Joe_Artificial.pdf +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/patient_intake_extraction/data/patient_forms/Patient_Intake_From_Jane_Artificial.docx +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/patient_intake_extraction/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/patient_intake_extraction/pyproject.toml +0 -0
- {cocoindex-0.1.76/examples/product_recommendation → cocoindex-0.1.78/examples/pdf_embedding}/.env +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/pdf_embedding/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/pdf_embedding/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/pdf_embedding/pyproject.toml +0 -0
- {cocoindex-0.1.76/examples/text_embedding → cocoindex-0.1.78/examples/product_recommendation}/.env +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/img/cocoinsight.png +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/img/neo4j.png +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/products/p1.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/products/p2.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/products/p3.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/products/p4.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/products/p5.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/products/p6.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/products/p7.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/products/p8.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/products/p9.json +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/product_recommendation/pyproject.toml +0 -0
- {cocoindex-0.1.76/examples/text_embedding_qdrant → cocoindex-0.1.78/examples/text_embedding}/.env +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/text_embedding/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/text_embedding/Text_Embedding.ipynb +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/text_embedding/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/text_embedding/markdown_files/1706.03762v7.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/text_embedding/markdown_files/1810.04805v2.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/text_embedding/markdown_files/rfc8259.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/text_embedding/pyproject.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/text_embedding_qdrant/README.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/text_embedding_qdrant/main.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/text_embedding_qdrant/markdown_files/rfc8259.md +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/examples/text_embedding_qdrant/pyproject.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/pyproject.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/__init__.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/auth_registry.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/cli.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/convert.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/flow.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/functions.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/index.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/lib.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/llm.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/py.typed +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/runtime.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/setup.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/sources.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/targets.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/tests/__init__.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/tests/test_convert.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/tests/test_optional_database.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/tests/test_typing.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/tests/test_validation.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/typing.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/utils.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/python/cocoindex/validation.py +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/ruff.toml +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/base/duration.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/base/field_attrs.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/base/json_schema.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/base/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/base/schema.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/base/spec.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/base/value.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/builder/analyzed_flow.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/builder/analyzer.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/builder/exec_ctx.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/builder/flow_builder.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/builder/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/builder/plan.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/execution/db_tracking.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/execution/db_tracking_setup.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/execution/dumper.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/execution/indexing_status.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/execution/live_updater.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/execution/memoization.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/execution/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/execution/row_indexer.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/execution/source_indexer.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/execution/stats.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/lib.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/llm/anthropic.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/llm/gemini.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/llm/litellm.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/llm/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/llm/ollama.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/llm/openai.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/llm/openrouter.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/llm/vllm.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/llm/voyage.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/factory_bases.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/functions/embed_text.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/functions/extract_by_llm.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/functions/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/functions/parse_json.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/functions/split_recursively.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/functions/test_utils.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/interface.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/py_factory.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/registration.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/registry.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/sdk.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/sources/amazon_s3.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/sources/azure_blob.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/sources/google_drive.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/sources/local_file.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/sources/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/targets/kuzu.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/targets/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/targets/neo4j.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/targets/postgres.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/targets/qdrant.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/targets/shared/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/targets/shared/property_graph.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/ops/targets/shared/table_columns.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/prelude.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/py/convert.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/py/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/server.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/service/error.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/service/flows.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/service/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/setup/auth_registry.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/setup/components.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/setup/db_metadata.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/setup/driver.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/setup/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/setup/states.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/utils/concur_control.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/utils/db.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/utils/fingerprint.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/utils/immutable.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/utils/mod.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/utils/retryable.rs +0 -0
- {cocoindex-0.1.76 → cocoindex-0.1.78}/src/utils/yaml_ser.rs +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cocoindex
|
|
3
|
-
Version: 0.1.76
|
|
3
|
+
Version: 0.1.78
|
|
4
4
|
Requires-Dist: click>=8.1.8
|
|
5
5
|
Requires-Dist: rich>=14.0.0
|
|
6
6
|
Requires-Dist: python-dotenv>=1.1.0
|
|
@@ -214,6 +214,7 @@ It defines an index flow like this:
|
|
|
214
214
|
| [Image Search with Vision API](examples/image_search) | Generates detailed captions for images using a vision model, embeds them, and enables live-updating semantic search via FastAPI, served on a React frontend |
|
|
215
215
|
| [Face Recognition](examples/face_recognition) | Recognize faces in images and build embedding index |
|
|
216
216
|
| [Paper Metadata](examples/paper_metadata) | Index papers in PDF files, and build metadata tables for each paper |
|
|
217
|
+
| [Multi Format Indexing](examples/multi_format_indexing) | Build visual document index from PDFs and images with ColPali for semantic search |
|
|
217
218
|
| [Custom Output Files](examples/custom_output_files) | Convert markdown files to HTML files and save them to a local directory, using *CocoIndex Custom Targets* |
|
|
218
219
|
|
|
219
220
|
More coming and stay tuned 👀!
|
|
@@ -185,6 +185,7 @@ It defines an index flow like this:
|
|
|
185
185
|
| [Image Search with Vision API](examples/image_search) | Generates detailed captions for images using a vision model, embeds them, and enables live-updating semantic search via FastAPI, served on a React frontend |
|
|
186
186
|
| [Face Recognition](examples/face_recognition) | Recognize faces in images and build embedding index |
|
|
187
187
|
| [Paper Metadata](examples/paper_metadata) | Index papers in PDF files, and build metadata tables for each paper |
|
|
188
|
+
| [Multi Format Indexing](examples/multi_format_indexing) | Build visual document index from PDFs and images with ColPali for semantic search |
|
|
188
189
|
| [Custom Output Files](examples/custom_output_files) | Convert markdown files to HTML files and save them to a local directory, using *CocoIndex Custom Targets* |
|
|
189
190
|
|
|
190
191
|
More coming and stay tuned 👀!
|
|
@@ -199,3 +199,23 @@ Currently, the following types are key types
|
|
|
199
199
|
- *Uuid*
|
|
200
200
|
- *Date*
|
|
201
201
|
- *Struct* with all fields being key types (using `@dataclass(frozen=True)` or `NamedTuple`)
|
|
202
|
+
|
|
203
|
+
## *Null* Values
|
|
204
|
+
|
|
205
|
+
CocoIndex supports *Null* values. A *Null* value represents the absence of data or an unknown value, distinct from empty strings, zero numbers, or false boolean values.
|
|
206
|
+
|
|
207
|
+
### Nullable Type
|
|
208
|
+
|
|
209
|
+
For any data (e.g. a field of a *Struct*, an argument or return value of a CocoIndex function), if it is nullable, it means its value can be *Null*.
|
|
210
|
+
We use a `?` suffix to indicate a nullable type, e.g. *Str?*, *Person?*.
|
|
211
|
+
|
|
212
|
+
In Python, *Null* is represented as `None`, so a nullable type can be represented by `T | None` or `typing.Optional[T]`.
|
|
213
|
+
|
|
214
|
+
### *Null* propagation in CocoIndex functions
|
|
215
|
+
|
|
216
|
+
A function may specify whether each input argument is nullable or not.
|
|
217
|
+
Non-nullable argument means the function needs a known value for the argument to work.
|
|
218
|
+
However, it doesn't forbid the argument from being *Null* at runtime.
|
|
219
|
+
When a non-nullable argument receives a *Null* value, the function execution is skipped and the result is *Null*.
|
|
220
|
+
|
|
221
|
+
For example, for [`SplitRecursively` function](/docs/ops/functions#splitrecursively), the `text` and `chunk_size` arguments are not nullable. If the input value of either of them is *Null*, the function will return *Null*.
|
|
@@ -360,21 +360,40 @@ It will use `Staging__doc_embeddings` as the collection name if the current app
|
|
|
360
360
|
|
|
361
361
|
### Control Processing Concurrency
|
|
362
362
|
|
|
363
|
-
|
|
363
|
+
CocoIndex processes data in parallel to maximize throughput, but unconstrained parallelism can overwhelm your system.
|
|
364
|
+
Processing too many items simultaneously can lead to:
|
|
364
365
|
|
|
365
|
-
|
|
366
|
-
|
|
366
|
+
- **Memory exhaustion**: Large datasets loaded concurrently can consume excessive RAM
|
|
367
|
+
- **Resource contention**: Too many parallel operations competing for CPU, disk I/O, or network bandwidth
|
|
368
|
+
- **System instability**: High concurrency can cause timeouts, crashes, or degraded performance
|
|
367
369
|
|
|
368
|
-
|
|
370
|
+
To prevent these issues, CocoIndex provides concurrency controls that limit how many data items are processed simultaneously.
|
|
369
371
|
|
|
370
|
-
|
|
372
|
+
#### Concurrency Options
|
|
371
373
|
|
|
372
|
-
|
|
373
|
-
If both global and per-source limits are specified, both need to be satisfied to admit additional source rows.
|
|
374
|
+
You can control processing concurrency using these options:
|
|
374
375
|
|
|
375
|
-
*
|
|
376
|
+
* `max_inflight_rows`: Limits the maximum number of data rows being processed concurrently
|
|
377
|
+
* `max_inflight_bytes`: Limits the total memory footprint of data being processed concurrently (measured in bytes)
|
|
376
378
|
|
|
377
|
-
|
|
379
|
+
When these limits are reached, CocoIndex will pause loading new data until some of the current processing completes, ensuring your system remains stable.
|
|
380
|
+
|
|
381
|
+
#### Where to Apply Concurrency Controls
|
|
382
|
+
|
|
383
|
+
These concurrency options can be configured at different levels:
|
|
384
|
+
|
|
385
|
+
* **Source level** via [`FlowBuilder.add_source()`](#import-from-source): Controls how many rows from a data source are processed simultaneously. This prevents overwhelming your system when ingesting large datasets.
|
|
386
|
+
|
|
387
|
+
You can also set global limits across all sources and flows using [`GlobalExecutionOptions`](/docs/core/settings#globalexecutionoptions) or environment variables [`COCOINDEX_SOURCE_MAX_INFLIGHT_ROWS`](/docs/core/settings#list-of-environment-variables)/[`COCOINDEX_SOURCE_MAX_INFLIGHT_BYTES`](/docs/core/settings#list-of-environment-variables).
|
|
388
|
+
When both global and per-source limits are specified, both limits are enforced independently - a new row can only be processed if there's available capacity in both the global budget (shared across all sources) and the per-source budget (specific to that source).
|
|
389
|
+
|
|
390
|
+
* **Row iteration level** via [`DataSlice.row()`](#for-each-row): Provides fine-grained control over parallel processing within nested data structures, allowing you to tune concurrency at any level of your data hierarchy.
|
|
391
|
+
|
|
392
|
+
:::note
|
|
393
|
+
|
|
394
|
+
The `max_inflight_bytes` limit only counts the size of data that already exists in memory before any transformations are applied. It doesn't include the memory used by intermediate processing results.
|
|
395
|
+
|
|
396
|
+
:::
|
|
378
397
|
|
|
379
398
|
For example:
|
|
380
399
|
|
|
@@ -77,24 +77,33 @@ If not set, all flows are in a default unnamed namespace.
|
|
|
77
77
|
|
|
78
78
|
`DatabaseConnectionSpec` configures the connection to a database. Only Postgres is supported for now. It has the following fields:
|
|
79
79
|
|
|
80
|
-
* `url` (type: `str
|
|
80
|
+
* `url` (type: `str`): The URL of the Postgres database to use as the internal storage, e.g. `postgres://cocoindex:cocoindex@localhost/cocoindex`.
|
|
81
81
|
|
|
82
82
|
*Environment variable* for `Settings.database.url`: `COCOINDEX_DATABASE_URL`
|
|
83
83
|
|
|
84
|
-
* `user` (type: `str`,
|
|
84
|
+
* `user` (type: `str | None`, default: `None`): The username for the Postgres database. If not provided, username will come from `url`.
|
|
85
85
|
|
|
86
86
|
*Environment variable* for `Settings.database.user`: `COCOINDEX_DATABASE_USER`
|
|
87
87
|
|
|
88
|
-
* `password` (type: `str`,
|
|
88
|
+
* `password` (type: `str | None`, default: `None`): The password for the Postgres database. If not provided, password will come from `url`.
|
|
89
89
|
|
|
90
90
|
*Environment variable* for `Settings.database.password`: `COCOINDEX_DATABASE_PASSWORD`
|
|
91
91
|
|
|
92
|
-
:::tip
|
|
92
|
+
:::tip
|
|
93
93
|
|
|
94
|
-
Please be careful that all values in `url` needs to be url-encoded if they contain special characters.
|
|
95
|
-
For this reason, prefer to use the separated `user` and `password` fields for username and password.
|
|
94
|
+
Please be careful that all values in `url` need to be URL-encoded if they contain special characters.
|
|
95
|
+
For this reason, prefer to use the separated `user` and `password` fields for username and password.
|
|
96
|
+
|
|
97
|
+
:::
|
|
98
|
+
|
|
99
|
+
* `max_connections` (type: `int`, default: `64`): The maximum number of connections to keep in the pool.
|
|
100
|
+
|
|
101
|
+
*Environment variable* for `Settings.database.max_connections`: `COCOINDEX_DATABASE_MAX_CONNECTIONS`
|
|
102
|
+
|
|
103
|
+
* `min_connections` (type: `int`, default: `16`): The minimum number of connections to keep in the pool.
|
|
104
|
+
|
|
105
|
+
*Environment variable* for `Settings.database.min_connections`: `COCOINDEX_DATABASE_MIN_CONNECTIONS`
|
|
96
106
|
|
|
97
|
-
:::
|
|
98
107
|
|
|
99
108
|
:::info
|
|
100
109
|
|
|
@@ -109,10 +118,10 @@ If you use the Postgres database hosted by [Supabase](https://supabase.com/), pl
|
|
|
109
118
|
|
|
110
119
|
`GlobalExecutionOptions` is used to configure the global execution options shared by all flows. It has the following fields:
|
|
111
120
|
|
|
112
|
-
* `source_max_inflight_rows` (type: `int`,
|
|
113
|
-
* `source_max_inflight_bytes` (type: `int`,
|
|
121
|
+
* `source_max_inflight_rows` (type: `int | None`, default: `256`): The maximum number of concurrent inflight rows for all source operations.
|
|
122
|
+
* `source_max_inflight_bytes` (type: `int | None`, default: `None`): The maximum number of concurrent inflight bytes for all source operations.
|
|
114
123
|
|
|
115
|
-
See also [flow definition docs](/docs/core/flow_def#control-processing-concurrency) to control processing concurrency on per-source basis.
|
|
124
|
+
See also [flow definition docs](/docs/core/flow_def#control-processing-concurrency) about why it's necessary to control processing concurrency, and how to configure it on a per-source basis.
|
|
116
125
|
If both global and per-source limits are specified, both need to be satisfied to admit additional source rows.
|
|
117
126
|
|
|
118
127
|
## List of Environment Variables
|
|
@@ -125,5 +134,7 @@ This is the list of environment variables, each of which has a corresponding fie
|
|
|
125
134
|
| `COCOINDEX_DATABASE_URL` | `database.url` | Yes |
|
|
126
135
|
| `COCOINDEX_DATABASE_USER` | `database.user` | No |
|
|
127
136
|
| `COCOINDEX_DATABASE_PASSWORD` | `database.password` | No |
|
|
128
|
-
| `
|
|
137
|
+
| `COCOINDEX_DATABASE_MAX_CONNECTIONS` | `database.max_connections` | No (default: `64`) |
|
|
138
|
+
| `COCOINDEX_DATABASE_MIN_CONNECTIONS` | `database.min_connections` | No (default: `16`) |
|
|
139
|
+
| `COCOINDEX_SOURCE_MAX_INFLIGHT_ROWS` | `global_execution_options.source_max_inflight_rows` | No (default: `256`) |
|
|
129
140
|
| `COCOINDEX_SOURCE_MAX_INFLIGHT_BYTES` | `global_execution_options.source_max_inflight_bytes` | No |
|
|
@@ -9,12 +9,12 @@ description: CocoIndex Built-in Functions
|
|
|
9
9
|
|
|
10
10
|
`ParseJson` parses a given text to JSON.
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
Input data:
|
|
13
13
|
|
|
14
|
-
* `text` (
|
|
15
|
-
* `language` (`
|
|
14
|
+
* `text` (*Str*): The source text to parse.
|
|
15
|
+
* `language` (*Str?*, default: `"json"`): The language of the source text. Only `json` is supported now.
|
|
16
16
|
|
|
17
|
-
Return: *Json
|
|
17
|
+
Return: *Json*, the parsed JSON object.
|
|
18
18
|
|
|
19
19
|
## SplitRecursively
|
|
20
20
|
|
|
@@ -37,7 +37,7 @@ Input data:
|
|
|
37
37
|
|
|
38
38
|
* `text` (*Str*): The text to split.
|
|
39
39
|
* `chunk_size` (*Int64*): The maximum size of each chunk, in bytes.
|
|
40
|
-
* `min_chunk_size` (*Int64*,
|
|
40
|
+
* `min_chunk_size` (*Int64*, default: `chunk_size / 2`): The minimum size of each chunk, in bytes.
|
|
41
41
|
|
|
42
42
|
:::note
|
|
43
43
|
|
|
@@ -48,8 +48,8 @@ Input data:
|
|
|
48
48
|
|
|
49
49
|
:::
|
|
50
50
|
|
|
51
|
-
* `chunk_overlap` (*Int64
|
|
52
|
-
* `language` (*Str*,
|
|
51
|
+
* `chunk_overlap` (*Int64?*, default: *Null*): The maximum overlap size between adjacent chunks, in bytes.
|
|
52
|
+
* `language` (*Str*, default: `""`): The language of the document.
|
|
53
53
|
Can be a language name (e.g. `Python`, `Javascript`, `Markdown`) or a file extension (e.g. `.py`, `.js`, `.md`).
|
|
54
54
|
|
|
55
55
|
|
|
@@ -61,7 +61,7 @@ Input data:
|
|
|
61
61
|
* `custom_languages` in the spec, against the `language_name` or `aliases` field of each entry.
|
|
62
62
|
* Builtin languages (see [Supported Languages](#supported-languages) section below), against the language, aliases or file extensions of each entry.
|
|
63
63
|
|
|
64
|
-
All matches are in a case-insensitive manner.
|
|
64
|
+
All matches are in a case-insensitive manner.
|
|
65
65
|
|
|
66
66
|
* If no match is found, the input will be treated as plain text.
|
|
67
67
|
|
|
@@ -185,7 +185,7 @@ Not all LLM APIs support text embedding. See the [LLM API Types table](/docs/ai/
|
|
|
185
185
|
|
|
186
186
|
Input data:
|
|
187
187
|
|
|
188
|
-
* `text` (*Str
|
|
188
|
+
* `text` (*Str*): The text to embed.
|
|
189
189
|
|
|
190
190
|
Return: *Vector[Float32, N]*, where *N* is the dimension of the embedding vector determined by the model.
|
|
191
191
|
|
|
@@ -4,7 +4,7 @@ version = "0.1.0"
|
|
|
4
4
|
description = "Image search examples for cocoindex: CLIP and ColPali-based embedding."
|
|
5
5
|
requires-python = ">=3.11"
|
|
6
6
|
dependencies = [
|
|
7
|
-
"cocoindex[colpali]>=0.1.
|
|
7
|
+
"cocoindex[colpali]>=0.1.76",
|
|
8
8
|
"python-dotenv>=1.0.1",
|
|
9
9
|
"fastapi>=0.100.0",
|
|
10
10
|
"torch>=2.0.0",
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Build visual document index from PDFs and images with ColPali
|
|
2
|
+
[CocoIndex on GitHub](https://github.com/cocoindex-io/cocoindex)
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
In this example, we build a visual document indexing flow using ColPali for embedding PDFs and images, and query the index with natural language.
|
|
6
|
+
|
|
7
|
+
We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful.
|
|
8
|
+
|
|
9
|
+
## Steps
|
|
10
|
+
### Indexing Flow
|
|
11
|
+
|
|
12
|
+
1. We ingest a list of PDF files and image files from the `source_files` directory.
|
|
13
|
+
2. For each file:
|
|
14
|
+
- **PDF files**: convert each page to a high-resolution image (300 DPI)
|
|
15
|
+
- **Image files**: use the image directly
|
|
16
|
+
- Generate visual embeddings for each page/image using the ColPali model
|
|
17
|
+
3. We will save the embeddings and metadata in the Qdrant vector database.
|
|
18
|
+
|
|
19
|
+
### Query
|
|
20
|
+
We will match against user-provided natural language text using ColPali's text-to-visual embedding capability, enabling semantic search across visual document content.
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
## Prerequisite
|
|
25
|
+
[Install Qdrant](https://qdrant.tech/documentation/guides/installation/) if you don't have one running locally.
|
|
26
|
+
|
|
27
|
+
You can start Qdrant with Docker:
|
|
28
|
+
```bash
|
|
29
|
+
docker run -p 6333:6333 -p 6334:6334 qdrant/qdrant
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Run
|
|
33
|
+
|
|
34
|
+
Install dependencies:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install -e .
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Setup:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
cocoindex setup main.py
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Update index:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
cocoindex update main.py
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Run:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
python main.py
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## About ColPali
|
|
59
|
+
This example uses [ColPali](https://github.com/illuin-tech/colpali), a state-of-the-art vision-language model that enables:
|
|
60
|
+
- Direct visual understanding of document layouts, tables, and figures
|
|
61
|
+
- Natural language queries against visual document content
|
|
62
|
+
- No need for OCR or text extraction - works directly with document images
|
|
63
|
+
|
|
64
|
+
## CocoInsight
|
|
65
|
+
I used CocoInsight (free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline. It just connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight:
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
cocoindex server -ci main.py
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import cocoindex
|
|
2
|
+
import os
|
|
3
|
+
import mimetypes
|
|
4
|
+
|
|
5
|
+
from dotenv import load_dotenv
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pdf2image import convert_from_bytes
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
|
|
10
|
+
from qdrant_client import QdrantClient
|
|
11
|
+
|
|
12
|
+
QDRANT_GRPC_URL = "http://localhost:6334"
|
|
13
|
+
QDRANT_COLLECTION = "MultiFormatIndexings"
|
|
14
|
+
COLPALI_MODEL_NAME = os.getenv("COLPALI_MODEL", "vidore/colpali-v1.2")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
|
|
18
|
+
class Page:
|
|
19
|
+
page_number: int | None
|
|
20
|
+
image: bytes
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@cocoindex.op.function()
|
|
24
|
+
def file_to_pages(filename: str, content: bytes) -> list[Page]:
|
|
25
|
+
"""
|
|
26
|
+
Convert a file into a list of pages, based on the MIME type guessed from its filename.
|
|
27
|
+
Returns one `Page` per page for a PDF file, a single `Page` (with no page number) for an image file, and an empty list for any other file type.
|
|
28
|
+
"""
|
|
29
|
+
# Guess the MIME type based on the filename
|
|
30
|
+
mime_type, _ = mimetypes.guess_type(filename)
|
|
31
|
+
|
|
32
|
+
if mime_type == "application/pdf":
|
|
33
|
+
images = convert_from_bytes(content, dpi=300)
|
|
34
|
+
pages = []
|
|
35
|
+
for i, image in enumerate(images):
|
|
36
|
+
with BytesIO() as buffer:
|
|
37
|
+
image.save(buffer, format="PNG")
|
|
38
|
+
pages.append(Page(page_number=i + 1, image=buffer.getvalue()))
|
|
39
|
+
return pages
|
|
40
|
+
elif mime_type and mime_type.startswith("image/"):
|
|
41
|
+
return [Page(page_number=None, image=content)]
|
|
42
|
+
else:
|
|
43
|
+
return []
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
qdrant_connection = cocoindex.add_auth_entry(
|
|
47
|
+
"qdrant_connection",
|
|
48
|
+
cocoindex.targets.QdrantConnection(grpc_url=QDRANT_GRPC_URL),
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@cocoindex.flow_def(name="MultiFormatIndexing")
|
|
53
|
+
def multi_format_indexing_flow(
|
|
54
|
+
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
|
|
55
|
+
) -> None:
|
|
56
|
+
"""
|
|
57
|
+
Define an example flow that embeds files into a vector database.
|
|
58
|
+
"""
|
|
59
|
+
data_scope["documents"] = flow_builder.add_source(
|
|
60
|
+
cocoindex.sources.LocalFile(path="source_files", binary=True)
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
output_embeddings = data_scope.add_collector()
|
|
64
|
+
|
|
65
|
+
with data_scope["documents"].row() as doc:
|
|
66
|
+
doc["pages"] = flow_builder.transform(
|
|
67
|
+
file_to_pages, filename=doc["filename"], content=doc["content"]
|
|
68
|
+
)
|
|
69
|
+
with doc["pages"].row() as page:
|
|
70
|
+
page["embedding"] = page["image"].transform(
|
|
71
|
+
cocoindex.functions.ColPaliEmbedImage(model=COLPALI_MODEL_NAME)
|
|
72
|
+
)
|
|
73
|
+
output_embeddings.collect(
|
|
74
|
+
id=cocoindex.GeneratedField.UUID,
|
|
75
|
+
filename=doc["filename"],
|
|
76
|
+
page=page["page_number"],
|
|
77
|
+
embedding=page["embedding"],
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
output_embeddings.export(
|
|
81
|
+
"multi_format_indexings",
|
|
82
|
+
cocoindex.targets.Qdrant(
|
|
83
|
+
connection=qdrant_connection,
|
|
84
|
+
collection_name=QDRANT_COLLECTION,
|
|
85
|
+
),
|
|
86
|
+
primary_key_fields=["id"],
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@cocoindex.transform_flow()
|
|
91
|
+
def query_to_colpali_embedding(
|
|
92
|
+
text: cocoindex.DataSlice[str],
|
|
93
|
+
) -> cocoindex.DataSlice[list[list[float]]]:
|
|
94
|
+
return text.transform(
|
|
95
|
+
cocoindex.functions.ColPaliEmbedQuery(model=COLPALI_MODEL_NAME)
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _main() -> None:
|
|
100
|
+
# Initialize Qdrant client
|
|
101
|
+
client = QdrantClient(url=QDRANT_GRPC_URL, prefer_grpc=True)
|
|
102
|
+
|
|
103
|
+
# Run queries in a loop to demonstrate the query capabilities.
|
|
104
|
+
while True:
|
|
105
|
+
query = input("Enter search query (or Enter to quit): ")
|
|
106
|
+
if query == "":
|
|
107
|
+
break
|
|
108
|
+
|
|
109
|
+
# Get the embedding for the query
|
|
110
|
+
query_embedding = query_to_colpali_embedding.eval(query)
|
|
111
|
+
|
|
112
|
+
search_results = client.query_points(
|
|
113
|
+
collection_name=QDRANT_COLLECTION,
|
|
114
|
+
query=query_embedding, # Multi-vector format: list[list[float]]
|
|
115
|
+
using="embedding", # Specify the vector field name
|
|
116
|
+
limit=5,
|
|
117
|
+
with_payload=True,
|
|
118
|
+
)
|
|
119
|
+
print("\nSearch results:")
|
|
120
|
+
for result in search_results.points:
|
|
121
|
+
score = result.score
|
|
122
|
+
payload = result.payload
|
|
123
|
+
if payload is None:
|
|
124
|
+
continue
|
|
125
|
+
page_number = payload["page"]
|
|
126
|
+
page_number_str = f"Page:{page_number}" if page_number is not None else ""
|
|
127
|
+
print(f"[{score:.3f}] {payload['filename']} {page_number_str}")
|
|
128
|
+
print("---")
|
|
129
|
+
print()
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
if __name__ == "__main__":
|
|
133
|
+
load_dotenv()
|
|
134
|
+
cocoindex.init()
|
|
135
|
+
_main()
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pdf-embedding"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Simple example for cocoindex: build embedding index based on local PDF files."
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"cocoindex[colpali]>=0.1.76",
|
|
8
|
+
"python-dotenv>=1.0.1",
|
|
9
|
+
"pdf2image>=1.17.0",
|
|
10
|
+
"qdrant-client>=1.15.0",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[tool.setuptools]
|
|
14
|
+
packages = []
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|