lindera 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: a8fae246f910033fc6c1b40d3ba6d7cf36df5911b7bd8d84eac97e77f3a9eb29
4
+ data.tar.gz: 248e5eeb5a3824e05ebabd104579c558a36a53a7c0b82b4b314b68b9f971459f
5
+ SHA512:
6
+ metadata.gz: 00a9bf21d2b3a68119ff442e64e42b0de5f1579be4c631450c5715ac1a3e1b5d68ae46757e03fd6140661e49633d2f76bea34fbb7965a3da9fc54b1b440a659a
7
+ data.tar.gz: 6e2b6887e847433f64751770f47eca868aa3cabcf0c91666618169399abf4746962369ae6d22acdb508a8a0ff62a809eaed2a63722258f77874fb139d52ba51d
data/Cargo.toml ADDED
@@ -0,0 +1,40 @@
1
+ [package]
2
+ name = "lindera-ruby"
3
+ version = { workspace = true }
4
+ edition = { workspace = true }
5
+ description = "A Ruby binding for Lindera."
6
+ documentation = "https://docs.rs/lindera-ruby"
7
+ homepage = { workspace = true }
8
+ repository = { workspace = true }
9
+ readme = "README.md"
10
+ keywords = ["morphological", "analysis", "library", "ruby", "binding"]
11
+ categories = { workspace = true }
12
+ license = { workspace = true }
13
+
14
+ [lib]
15
+ name = "lindera_ruby"
16
+ crate-type = ["cdylib", "lib"]
17
+
18
+ [features]
19
+ embed-ipadic = ["lindera/embed-ipadic"] # Include Japanese dictionary (IPADIC)
20
+ embed-ipadic-neologd = [
21
+ "lindera/embed-ipadic-neologd",
22
+ ] # Include Japanese dictionary (IPADIC NEologd)
23
+ embed-unidic = ["lindera/embed-unidic"] # Include Japanese dictionary (UniDic)
24
+ embed-ko-dic = ["lindera/embed-ko-dic"] # Include Korean dictionary (ko-dic)
25
+ embed-cc-cedict = [
26
+ "lindera/embed-cc-cedict",
27
+ ] # Include Chinese dictionary (CC-CEDICT)
28
+ embed-jieba = ["lindera/embed-jieba"] # Include Chinese dictionary (Jieba)
29
+ embed-cjk = [
30
+ "lindera/embed-cjk",
31
+ ] # Embed CJK dictionaries (IPADIC, ko-dic, Jieba)
32
+ train = ["lindera/train"] # Enable training functionality
33
+ default = ["train"] # No dictionaries included
34
+
35
+ [dependencies]
36
+ lindera = { workspace = true }
37
+ magnus = { version = "0.8.2", features = ["rb-sys"] }
38
+ num_cpus = { workspace = true }
39
+ serde = { workspace = true, features = ["derive"] }
40
+ serde_json = { workspace = true }
data/README.md ADDED
@@ -0,0 +1,78 @@
1
+ # lindera-ruby
2
+
3
+ Ruby binding for [Lindera](https://github.com/lindera/lindera), a morphological analysis engine for CJK text.
4
+
5
+ ## Overview
6
+
7
+ lindera-ruby provides a Ruby interface to the Lindera morphological analysis engine, supporting Japanese, Korean, and Chinese text analysis.
8
+
9
+ - **Multi-language Support**: Japanese (IPADIC, IPADIC-NEologd, UniDic), Korean (ko-dic), Chinese (CC-CEDICT, Jieba)
10
+ - **Character Filters**: Text preprocessing with mapping, regex, Unicode normalization, and Japanese iteration mark handling
11
+ - **Token Filters**: Post-processing filters including lowercase, length filtering, stop words, and Japanese-specific filters
12
+ - **Flexible Configuration**: Configurable tokenization modes and penalty settings
13
+ - **Metadata Support**: Complete dictionary schema and metadata management
14
+ - **Training & Export** (optional): Train custom morphological analysis models from corpus data
15
+
16
+ ## Requirements
17
+
18
+ - Ruby >= 3.1
19
+ - Rust >= 1.85
20
+
21
+ ## Dictionary
22
+
23
+ Pre-built dictionaries are available from [GitHub Releases](https://github.com/lindera/lindera/releases).
24
+ Download a dictionary archive (e.g. `lindera-ipadic-*.zip`) and extract it to a local path.
25
+
26
+ ## Install
27
+
28
+ ```bash
29
+ cd lindera-ruby
30
+ bundle install
31
+ bundle exec rake compile
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ ```ruby
37
+ require "lindera"
38
+
39
+ # Load dictionary from a local path (download from GitHub Releases)
40
+ dictionary = Lindera.load_dictionary("/path/to/ipadic")
41
+
42
+ # Create a tokenizer
43
+ tokenizer = Lindera::Tokenizer.new(dictionary, "normal", nil)
44
+
45
+ # Tokenize text
46
+ tokens = tokenizer.tokenize("関西国際空港")
47
+ tokens.each do |token|
48
+ puts "#{token.surface}: #{token.details&.join(', ')}"
49
+ end
50
+ ```
51
+
52
+ ### Using TokenizerBuilder
53
+
54
+ ```ruby
55
+ require "lindera"
56
+
57
+ builder = Lindera::TokenizerBuilder.new
58
+ builder.set_mode("normal")
59
+ builder.set_dictionary("/path/to/ipadic")
60
+
61
+ # Add filters
62
+ builder.append_character_filter("unicode_normalize", { "kind" => "nfkc" })
63
+ builder.append_token_filter("lowercase", nil)
64
+
65
+ tokenizer = builder.build
66
+ tokens = tokenizer.tokenize("テスト")
67
+ ```
68
+
69
+ ## Test
70
+
71
+ ```bash
72
+ bundle exec rake compile
73
+ bundle exec rake test
74
+ ```
75
+
76
+ ## License
77
+
78
+ MIT
data/ext/lindera_ruby/extconf.rb ADDED
@@ -0,0 +1,8 @@
1
+ require "rb_sys/mkmf"
2
+
3
+ create_rust_makefile("lindera/lindera_ruby") do |r|
4
+ # Features to enable are read from the LINDERA_FEATURES environment variable: a comma-separated list; surrounding whitespace and blank entries are ignored.
5
+ # e.g., LINDERA_FEATURES="embed-ipadic,embed-ko-dic" bundle exec rake compile
6
+ features = ENV.fetch("LINDERA_FEATURES", "").split(",").map(&:strip).reject(&:empty?)
7
+ r.features = features unless features.empty? # leave r.features unassigned when nothing was requested
8
+ end
data/lib/lindera.rb ADDED
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ begin
4
+ require 'lindera/lindera_ruby' # first try the extension under the lindera/ namespace
5
+ rescue LoadError
6
+ require 'lindera_ruby' # fallback when the namespaced path cannot be loaded; a LoadError here propagates
7
+ end
metadata ADDED
@@ -0,0 +1,100 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lindera
3
+ version: !ruby/object:Gem::Version
4
+ version: 3.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Lindera contributors
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: rb_sys
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '0.9'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '0.9'
26
+ - !ruby/object:Gem::Dependency
27
+ name: minitest
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '5.0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '5.0'
40
+ - !ruby/object:Gem::Dependency
41
+ name: rake
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '13.0'
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '13.0'
54
+ - !ruby/object:Gem::Dependency
55
+ name: rake-compiler
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '1.2'
61
+ type: :development
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '1.2'
68
+ description: Ruby bindings for Lindera, a morphological analysis library for CJK text
69
+ (Japanese, Korean, Chinese).
70
+ executables: []
71
+ extensions:
72
+ - ext/lindera_ruby/extconf.rb
73
+ extra_rdoc_files: []
74
+ files:
75
+ - Cargo.toml
76
+ - README.md
77
+ - ext/lindera_ruby/extconf.rb
78
+ - lib/lindera.rb
79
+ homepage: https://github.com/lindera/lindera
80
+ licenses:
81
+ - MIT
82
+ metadata: {}
83
+ rdoc_options: []
84
+ require_paths:
85
+ - lib
86
+ required_ruby_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '3.1'
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ requirements: []
97
+ rubygems_version: 4.0.6
98
+ specification_version: 4
99
+ summary: Ruby bindings for Lindera morphological analysis engine
100
+ test_files: []