npm - @agentor/dashscope - Versions diffs - 0.0.0 → 0.0.2 - Mend

@agentor/dashscope 0.0.0 → 0.0.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -10,6 +10,12 @@
 - **Chat Completions API** - Standard `/chat/completions` with function calling, streaming, and reasoning
 - **Responses API** - `/responses` endpoint with built-in tools support
+- **Embedding** - Text vectorization via OpenAI-compatible `/embeddings` endpoint
+- **Reranking** - Document reranking via `/reranks` endpoint
+- **Image Generation** - Text-to-image via multimodal generation endpoint
+- **Video Generation** - Text-to-video and image-to-video with async polling
+- **Speech Synthesis** - Text-to-speech for CosyVoice and Qwen-TTS models
+- **Transcription** - Speech-to-text for short and long audio
 - **Built-in Tools** - Web search, code interpreter, web extractor, file search, image search, MCP integration
 - **Thinking Mode** - Enable reasoning/thinking with configurable budget
 - **Multi-region** - Beijing, Singapore, US, Germany regions
@@ -245,20 +251,277 @@ const first = await generateText({
 });
 ```
+## Embedding
+```typescript
+import { embed, embedMany } from "ai";
+// Single text embedding
+const { embedding, usage } = await embed({
+  model: dashscope.embeddingModel("text-embedding-v4"),
+  value: "The clothes quality is excellent",
+});
+console.log(embedding.length); // 1024 (default dimensions)
+// Batch embedding
+const { embeddings } = await embedMany({
+  model: dashscope.embeddingModel("text-embedding-v4"),
+  values: ["Hello world", "Machine learning is fascinating"],
+});
+```
+### Custom Dimensions
+```typescript
+const { embedding } = await embed({
+  model: dashscope.embeddingModel("text-embedding-v4"),
+  value: "Custom dimension embedding",
+  providerOptions: {
+    openaiCompatible: {
+      dimensions: 256,
+    },
+  },
+});
+console.log(embedding.length); // 256
+```
+## Reranking
+```typescript
+import { rerank } from "ai";
+const { ranking } = await rerank({
+  model: dashscope.rerankingModel("qwen3-rerank"),
+  query: "What is a reranking model?",
+  documents: [
+    "Reranking models sort candidate texts by relevance",
+    "Quantum computing is a frontier field",
+    "Pre-trained models brought advances to reranking",
+  ],
+});
+for (const item of ranking) {
+  console.log(`Index: ${item.originalIndex}, Score: ${item.score}`);
+}
+```
+### Top N Results
+```typescript
+const { ranking } = await rerank({
+  model: dashscope.rerankingModel("qwen3-rerank"),
+  query: "How to reset password?",
+  documents: [
+    "Go to Settings > Security > Change Password",
+    "Forgot your password?",
+    "Two-factor authentication is supported",
+  ],
+  topN: 2,
+});
+```
+## Image Generation
+```typescript
+import { generateImage } from "ai";
+const { images } = await generateImage({
+  model: dashscope.imageModel("qwen-image-plus"),
+  prompt: "A cute cat sitting on a windowsill with sunlight streaming in",
+  providerOptions: {
+    dashscope: {
+      size: "1024*1024",
+    },
+  },
+});
+// images[0].uint8Array — raw image data
+// images[0].base64 — base64 encoded image
+```
+## Video Generation
+```typescript
+import { experimental_generateVideo as generateVideo } from "ai";
+// Text-to-video
+const { videos } = await generateVideo({
+  model: dashscope.videoModel("wan2.6-t2v"),
+  prompt: "A golden retriever running through a field of sunflowers",
+  providerOptions: {
+    dashscope: {
+      size: "1280*720",
+      duration: 5,
+    },
+  },
+});
+```
+### Image-to-Video
+Use a model ID containing `-i2v` for image-to-video mode:
+```typescript
+const { videos } = await generateVideo({
+  model: dashscope.videoModel("wan2.6-i2v-flash"),
+  prompt: "The cat stretches and walks away",
+  providerOptions: {
+    dashscope: {
+      resolution: "720P",
+    },
+  },
+  image: "data:image/png;base64,...", // or a URL string
+});
+```
+## Speech Synthesis (TTS)
+```typescript
+import { experimental_generateSpeech as generateSpeech } from "ai";
+import { writeFileSync } from "fs";
+const { audio } = await generateSpeech({
+  model: dashscope.speechModel("cosyvoice-v3-flash"),
+  text: "Hello, welcome to Agentor.",
+  providerOptions: {
+    dashscope: {
+      voice: "longanyang",
+      format: "wav",
+      sampleRate: 24000,
+    },
+  },
+});
+writeFileSync("output.wav", audio.uint8Array);
+```
+## Transcription (Speech-to-Text)
+### Short Audio (Sync)
+```typescript
+import { experimental_transcribe as transcribe } from "ai";
+const { text } = await transcribe({
+  model: dashscope.transcriptionModel("qwen3-asr-flash"),
+  audio: new URL("https://example.com/audio.mp3"),
+});
+console.log(text);
+```
+### Long Audio (Async)
+For async models, pass a publicly accessible audio URL as `providerOptions.dashscope.fileUrl`; the `audio` argument is then only a placeholder:
+```typescript
+const { text, segments } = await transcribe({
+  model: dashscope.transcriptionModel("qwen3-asr-flash-filetrans"),
+  audio: new Uint8Array(0), // placeholder; the audio is supplied via fileUrl below
+  providerOptions: {
+    dashscope: {
+      fileUrl: "https://example.com/long-audio.mp3",
+      enableWords: true,
+    },
+  },
+});
+```
 ## Provider Configuration
 ```typescript
 import { createDashScope } from "@agentor/dashscope";
 const dashscope = createDashScope({
-  apiKey: "sk-xxx",                       // or set DASHSCOPE_API_KEY env var
-  region: "beijing",                       // beijing | singapore | us | germany
-  workspaceId: "ws-xxx",                   // required for germany region
-  baseURL: "https://custom-endpoint.com",  // override default base URL
+  apiKey: "sk-xxx", // or set DASHSCOPE_API_KEY env var
+  region: "beijing", // beijing | singapore | us | germany
+  workspaceId: "ws-xxx", // required for germany region
+  baseURL: "https://custom-endpoint.com", // override default base URL
   headers: { "X-Custom-Header": "value" }, // custom headers
 });
 ```
+## Available Models
+> For the complete and up-to-date model list, see [Alibaba Cloud Model Studio](https://help.aliyun.com/zh/model-studio/models).
+### Language Models (Chat)
+| Model                 | Description                               |
+| --------------------- | ----------------------------------------- |
+| `qwen3.6-max-preview` | Flagship model with strongest reasoning   |
+| `qwen3.6-plus`        | Recommended, balanced capability and cost |
+| `qwen3.6-flash`       | Fastest, ultra-low cost                   |
+| `qwen3.5-plus`        | Enhanced reasoning model                  |
+| `qwen3.5-flash`       | Fast and efficient model                  |
+| `qwen3-coder-plus`    | Code-optimized model                      |
+| `qwen3-coder-flash`   | Fast code model                           |
+| `qwq-plus`            | Dedicated reasoning model                 |
+| `deepseek-v4-pro`     | DeepSeek V4 Pro                           |
+| `deepseek-v4-flash`   | DeepSeek V4 Flash                         |
+| `kimi-k2.6`           | Moonshot Kimi K2.6                        |
+| `glm-5.1`             | Zhipu GLM 5.1                             |
+### Embedding Models
+| Model                          | Dimensions              | Description                         |
+| ------------------------------ | ----------------------- | ----------------------------------- |
+| `text-embedding-v4`            | 64-2048 (default 1024)  | Text embedding for search/RAG       |
+| `text-embedding-v3`            | 512-1024 (default 1024) | Legacy text embedding               |
+| `qwen3-vl-embedding`           | 256-2560 (default 2560) | Multimodal (text + image) embedding |
+| `tongyi-embedding-vision-plus` | 64-1152 (default 1152)  | Cross-modal search embedding        |
+### Reranking Models
+| Model             | Description                             |
+| ----------------- | --------------------------------------- |
+| `qwen3-rerank`    | Text reranking, 100+ languages          |
+| `qwen3-vl-rerank` | Multimodal reranking (text/image/video) |
+| `gte-rerank-v2`   | Semantic text reranking                 |
+### Image Models
+| Model                | Description                                  |
+| -------------------- | -------------------------------------------- |
+| `wan2.7-image-pro`   | Latest Wan image generation, up to 4096x4096 |
+| `wan2.7-image`       | Wan image generation, up to 2048x2048        |
+| `qwen-image-2.0-pro` | Qwen image generation and editing            |
+| `qwen-image-max`     | High quality image generation                |
+| `qwen-image-plus`    | Enhanced image generation                    |
+| `z-image-turbo`      | Fast image generation                        |
+### Video Models
+| Model              | Mode | Description                           |
+| ------------------ | ---- | ------------------------------------- |
+| `wan2.7-t2v`       | T2V  | Recommended text-to-video with audio  |
+| `wan2.6-t2v`       | T2V  | Text-to-video with audio              |
+| `wan2.2-t2v-plus`  | T2V  | Text-to-video (silent)                |
+| `wan2.7-i2v`       | I2V  | Recommended image-to-video with audio |
+| `wan2.6-i2v`       | I2V  | Image-to-video with audio             |
+| `wan2.6-i2v-flash` | I2V  | Fast image-to-video                   |
+### Speech Models (TTS)
+| Model                      | Description                        |
+| -------------------------- | ---------------------------------- |
+| `cosyvoice-v3.5-plus`      | Latest flagship, best quality      |
+| `cosyvoice-v3.5-flash`     | Latest lightweight                 |
+| `cosyvoice-v3-plus`        | V3 enhanced                        |
+| `cosyvoice-v3-flash`       | V3 fast synthesis                  |
+| `qwen3-tts-flash-realtime` | Qwen TTS with 17 human-like voices |
+### Transcription Models (STT)
+| Model                       | Mode  | Description                    |
+| --------------------------- | ----- | ------------------------------ |
+| `qwen3-asr-flash`           | Sync  | Short audio (up to 5 min)      |
+| `qwen3-asr-flash-filetrans` | Async | Long audio (up to 12 hours)    |
+| `fun-asr`                   | Async | Speaker diarization, hot words |
+| `paraformer-v2`             | Async | Legacy async transcription     |
 ## License
 MIT © [Demo Macro](https://www.demomacro.com/)

package/dist/index.d.mts CHANGED Viewed

@@ -1,6 +1,7 @@
+import { OpenAICompatibleEmbeddingModel } from "@ai-sdk/openai-compatible";
 import * as _$_ai_sdk_provider_utils0 from "@ai-sdk/provider-utils";
 import { FetchFunction } from "@ai-sdk/provider-utils";
-import { LanguageModelV3 } from "@ai-sdk/provider";
+import { EmbeddingModelV3, Experimental_VideoModelV3, Experimental_VideoModelV3CallOptions, ImageModelV3, ImageModelV3CallOptions, LanguageModelV3, RerankingModelV3, RerankingModelV3CallOptions, SharedV3Warning, SpeechModelV3, SpeechModelV3CallOptions, TranscriptionModelV3, TranscriptionModelV3CallOptions } from "@ai-sdk/provider";
 //#region src/tools.d.ts
 declare const webSearchToolFactory: _$_ai_sdk_provider_utils0.ProviderToolFactoryWithOutputSchema<Record<string, never>, {
@@ -131,16 +132,12 @@ type DashScopeResponsesTools = typeof responsesTools;
 //#endregion
 //#region src/types.d.ts
 type DashScopeRegion = "beijing" | "singapore" | "us" | "germany";
-declare const DASHSCOPE_REGION_BASE_URLS: Record<DashScopeRegion, {
-  baseURL: string;
-  videoBaseURL: string;
-}>;
+declare const DASHSCOPE_REGION_URLS: Record<DashScopeRegion, string>;
 interface DashScopeProviderSettings {
   apiKey?: string;
   region?: DashScopeRegion;
   workspaceId?: string;
   baseURL?: string;
-  videoBaseURL?: string;
   headers?: Record<string, string>;
   fetch?: FetchFunction;
   includeUsage?: boolean;
@@ -164,13 +161,6 @@ interface DashScopeChatOptions {
   /** Enable code interpreter (requires enableThinking). */
   enableCodeInterpreter?: boolean;
 }
-interface DashScopeChatConfig {
-  provider: string;
-  baseURL: string;
-  headers: () => Record<string, string>;
-  fetch?: FetchFunction;
-  includeUsage?: boolean;
-}
 interface DashScopeResponsesOptions {
   enableThinking?: boolean;
   reasoning?: {
@@ -188,6 +178,12 @@ interface DashScopeResponsesNamespace {
 interface DashScopeProvider {
   (modelId: string): LanguageModelV3;
   languageModel(modelId: string): LanguageModelV3;
+  embeddingModel(modelId: string): EmbeddingModelV3;
+  rerankingModel(modelId: string): RerankingModelV3;
+  imageModel(modelId: string): ImageModelV3;
+  videoModel(modelId: string): Experimental_VideoModelV3;
+  speechModel(modelId: string): SpeechModelV3;
+  transcriptionModel(modelId: string): TranscriptionModelV3;
   chatOptions: (options: DashScopeChatOptions) => {
     providerOptions: {
       dashscope: DashScopeChatOptions;
@@ -201,10 +197,208 @@ interface DashScopeProvider {
   responses: DashScopeResponsesNamespace;
 }
 //#endregion
+//#region src/utils.d.ts
+interface DashScopeConfig {
+  provider: string;
+  baseURL: string;
+  headers: () => Record<string, string>;
+  fetch?: FetchFunction;
+  includeUsage?: boolean;
+}
+//#endregion
+//#region src/embedding.d.ts
+interface DashScopeEmbeddingOptions {
+  /** Output embedding dimensions. Supported by text-embedding-v4, text-embedding-v3, etc. */
+  dimensions?: number;
+}
+declare class DashScopeEmbeddingModel extends OpenAICompatibleEmbeddingModel {
+  constructor(modelId: string, config: DashScopeConfig);
+}
+//#endregion
+//#region src/image.d.ts
+interface DashScopeImageOptions {
+  /** Output image size, e.g. "2048*2048", "1024*1024", "1K", "2K". */
+  size?: string;
+  /** Negative prompt describing what to avoid. */
+  negativePrompt?: string;
+  /** Enable prompt extension/rewriting. Default depends on model. */
+  promptExtend?: boolean;
+  /** Add watermark. Default false. */
+  watermark?: boolean;
+  /** Number of images to generate. Default 1. */
+  n?: number;
+}
+declare class DashScopeImageModel implements ImageModelV3 {
+  readonly specificationVersion: "v3";
+  readonly modelId: string;
+  private readonly config;
+  constructor(modelId: string, config: DashScopeConfig);
+  get provider(): string;
+  get maxImagesPerCall(): number | undefined;
+  doGenerate(options: ImageModelV3CallOptions): Promise<{
+    images: string[];
+    warnings: SharedV3Warning[];
+    response: {
+      timestamp: Date;
+      modelId: string;
+      headers: Record<string, string> | undefined;
+    };
+  }>;
+}
+//#endregion
+//#region src/rerank.d.ts
+interface DashScopeRerankOptions {
+  /** English instruction to guide the reranking strategy. */
+  instruct?: string;
+}
+declare class DashScopeRerankingModel implements RerankingModelV3 {
+  readonly specificationVersion: "v3";
+  readonly modelId: string;
+  private readonly config;
+  constructor(modelId: string, config: DashScopeConfig);
+  get provider(): string;
+  doRerank(options: RerankingModelV3CallOptions): Promise<{
+    ranking: {
+      index: number;
+      relevanceScore: number;
+    }[];
+    warnings: SharedV3Warning[];
+    response: {
+      id: string | undefined;
+      modelId: string | undefined;
+      headers: Record<string, string> | undefined;
+    };
+  }>;
+}
+//#endregion
+//#region src/speech.d.ts
+interface DashScopeSpeechOptions {
+  /** Voice name. Model-specific, e.g. "longanyang" for CosyVoice, "Cherry" for Qwen-TTS. */
+  voice?: string;
+  /** Output audio format: "wav", "mp3", "pcm". Default depends on model. */
+  format?: string;
+  /** Sample rate. Default depends on model. */
+  sampleRate?: number;
+  /** Language type for Qwen-TTS: "Chinese" | "English" | "Japanese" | etc. */
+  languageType?: string;
+  /** Speaking speed. 0.5-2.0, default 1.0. */
+  speed?: number;
+  /** Volume. 0.5-2.0, default 1.0. */
+  volume?: number;
+  /** Pitch. -12 to 12, default 0. */
+  pitch?: number;
+}
+declare class DashScopeSpeechModel implements SpeechModelV3 {
+  readonly specificationVersion: "v3";
+  readonly modelId: string;
+  private readonly config;
+  constructor(modelId: string, config: DashScopeConfig);
+  get provider(): string;
+  doGenerate(options: SpeechModelV3CallOptions): Promise<{
+    audio: Uint8Array<ArrayBuffer>;
+    warnings: SharedV3Warning[];
+    request: {
+      body: Record<string, unknown>;
+    };
+    response: {
+      timestamp: Date;
+      modelId: string;
+      headers: Record<string, string> | undefined;
+    };
+  }>;
+}
+//#endregion
+//#region src/transcription.d.ts
+interface DashScopeTranscriptionOptions {
+  /**
+   * Publicly accessible audio file URL for async transcription.
+   * Required for async models (filetrans, fun-asr, paraformer) when using long audio.
+   */
+  fileUrl?: string;
+  /** Language hint(s), e.g. ["zh", "en"]. */
+  languageHints?: string[];
+  /** Enable inverse text normalization (convert spoken numbers/dates to written form). */
+  enableItn?: boolean;
+  /** Enable word-level timestamps. */
+  enableWords?: boolean;
+  /** Channel IDs to transcribe. Default [0]. */
+  channelId?: number[];
+  /** Polling interval in ms. Default 5000. (async mode only) */
+  pollIntervalMs?: number;
+  /** Polling timeout in ms. Default 600000. (async mode only) */
+  pollTimeoutMs?: number;
+}
+declare class DashScopeTranscriptionModel implements TranscriptionModelV3 {
+  readonly specificationVersion: "v3";
+  readonly modelId: string;
+  private readonly config;
+  constructor(modelId: string, config: DashScopeConfig);
+  get provider(): string;
+  doGenerate(options: TranscriptionModelV3CallOptions): Promise<{
+    text: string;
+    segments: {
+      text: string;
+      startSecond: number;
+      endSecond: number;
+    }[];
+    language: undefined;
+    durationInSeconds: undefined;
+    warnings: SharedV3Warning[];
+    response: {
+      timestamp: Date;
+      modelId: string;
+      headers: Record<string, string> | undefined;
+    };
+  }>;
+  private doSync;
+  private doAsync;
+}
+//#endregion
+//#region src/video.d.ts
+interface DashScopeVideoOptions {
+  /** Negative prompt. */
+  negativePrompt?: string;
+  /** Enable prompt extension. */
+  promptExtend?: boolean;
+  /** Add watermark. Default false. */
+  watermark?: boolean;
+  /** Resolution for I2V: "720P" | "1080P". For T2V: use size "WIDTH*HEIGHT". */
+  resolution?: string;
+  /** Size in "WIDTH*HEIGHT" format (T2V/R2V). */
+  size?: string;
+  /** Video duration in seconds. */
+  duration?: number;
+  /** Polling interval in ms. Default 5000. */
+  pollIntervalMs?: number;
+  /** Polling timeout in ms. Default 600000. */
+  pollTimeoutMs?: number;
+}
+declare class DashScopeVideoModel implements Experimental_VideoModelV3 {
+  readonly specificationVersion: "v3";
+  readonly modelId: string;
+  private readonly config;
+  constructor(modelId: string, config: DashScopeConfig);
+  get provider(): string;
+  get maxVideosPerCall(): number | undefined;
+  doGenerate(options: Experimental_VideoModelV3CallOptions): Promise<{
+    videos: {
+      type: "url";
+      url: string;
+      mediaType: string;
+    }[];
+    warnings: SharedV3Warning[];
+    response: {
+      timestamp: Date;
+      modelId: string;
+      headers: Record<string, string> | undefined;
+    };
+  }>;
+}
+//#endregion
 //#region src/provider.d.ts
 declare function createDashScope(options?: DashScopeProviderSettings): DashScopeProvider;
 //#endregion
 //#region src/index.d.ts
 declare const dashscope: DashScopeProvider;
 //#endregion
-export { DASHSCOPE_REGION_BASE_URLS, DashScopeChatConfig, DashScopeChatOptions, DashScopeProvider, DashScopeProviderSettings, DashScopeRegion, DashScopeResponsesNamespace, DashScopeResponsesOptions, DashScopeResponsesTools, createDashScope, dashscope, responsesTools };
+export { DASHSCOPE_REGION_URLS, DashScopeChatOptions, DashScopeEmbeddingModel, DashScopeEmbeddingOptions, DashScopeImageModel, DashScopeImageOptions, DashScopeProvider, DashScopeProviderSettings, DashScopeRegion, DashScopeRerankOptions, DashScopeRerankingModel, DashScopeResponsesNamespace, DashScopeResponsesOptions, DashScopeResponsesTools, DashScopeSpeechModel, DashScopeSpeechOptions, DashScopeTranscriptionModel, DashScopeTranscriptionOptions, DashScopeVideoModel, DashScopeVideoOptions, createDashScope, dashscope, responsesTools };