lindera 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.toml +40 -0
- data/README.md +78 -0
- data/ext/lindera_ruby/extconf.rb +8 -0
- data/lib/lindera.rb +7 -0
- metadata +100 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: a8fae246f910033fc6c1b40d3ba6d7cf36df5911b7bd8d84eac97e77f3a9eb29
|
|
4
|
+
data.tar.gz: 248e5eeb5a3824e05ebabd104579c558a36a53a7c0b82b4b314b68b9f971459f
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 00a9bf21d2b3a68119ff442e64e42b0de5f1579be4c631450c5715ac1a3e1b5d68ae46757e03fd6140661e49633d2f76bea34fbb7965a3da9fc54b1b440a659a
|
|
7
|
+
data.tar.gz: 6e2b6887e847433f64751770f47eca868aa3cabcf0c91666618169399abf4746962369ae6d22acdb508a8a0ff62a809eaed2a63722258f77874fb139d52ba51d
|
data/Cargo.toml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "lindera-ruby"
|
|
3
|
+
version = { workspace = true }
|
|
4
|
+
edition = { workspace = true }
|
|
5
|
+
description = "A Ruby binding for Lindera."
|
|
6
|
+
documentation = "https://docs.rs/lindera-ruby"
|
|
7
|
+
homepage = { workspace = true }
|
|
8
|
+
repository = { workspace = true }
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
keywords = ["morphological", "analysis", "library", "ruby", "binding"]
|
|
11
|
+
categories = { workspace = true }
|
|
12
|
+
license = { workspace = true }
|
|
13
|
+
|
|
14
|
+
[lib]
|
|
15
|
+
name = "lindera_ruby"
|
|
16
|
+
crate-type = ["cdylib", "lib"]
|
|
17
|
+
|
|
18
|
+
[features]
|
|
19
|
+
embed-ipadic = ["lindera/embed-ipadic"] # Include Japanese dictionary (IPADIC)
|
|
20
|
+
embed-ipadic-neologd = [
|
|
21
|
+
"lindera/embed-ipadic-neologd",
|
|
22
|
+
] # Include Japanese dictionary (IPADIC NEologd)
|
|
23
|
+
embed-unidic = ["lindera/embed-unidic"] # Include Japanese dictionary (UniDic)
|
|
24
|
+
embed-ko-dic = ["lindera/embed-ko-dic"] # Include Korean dictionary (ko-dic)
|
|
25
|
+
embed-cc-cedict = [
|
|
26
|
+
"lindera/embed-cc-cedict",
|
|
27
|
+
] # Include Chinese dictionary (CC-CEDICT)
|
|
28
|
+
embed-jieba = ["lindera/embed-jieba"] # Include Chinese dictionary (Jieba)
|
|
29
|
+
embed-cjk = [
|
|
30
|
+
"lindera/embed-cjk",
|
|
31
|
+
] # Embed CJK dictionaries (IPADIC, ko-dic, Jieba)
|
|
32
|
+
train = ["lindera/train"] # Enable training functionality
|
|
33
|
+
default = ["train"] # No dictionaries included
|
|
34
|
+
|
|
35
|
+
[dependencies]
|
|
36
|
+
lindera = { workspace = true }
|
|
37
|
+
magnus = { version = "0.8.2", features = ["rb-sys"] }
|
|
38
|
+
num_cpus = { workspace = true }
|
|
39
|
+
serde = { workspace = true, features = ["derive"] }
|
|
40
|
+
serde_json = { workspace = true }
|
data/README.md
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# lindera-ruby
|
|
2
|
+
|
|
3
|
+
Ruby binding for [Lindera](https://github.com/lindera/lindera), a morphological analysis engine for CJK text.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
lindera-ruby provides a Ruby interface to the Lindera morphological analysis engine, supporting Japanese, Korean, and Chinese text analysis.
|
|
8
|
+
|
|
9
|
+
- **Multi-language Support**: Japanese (IPADIC, IPADIC-NEologd, UniDic), Korean (ko-dic), Chinese (CC-CEDICT, Jieba)
|
|
10
|
+
- **Character Filters**: Text preprocessing with mapping, regex, Unicode normalization, and Japanese iteration mark handling
|
|
11
|
+
- **Token Filters**: Post-processing filters including lowercase, length filtering, stop words, and Japanese-specific filters
|
|
12
|
+
- **Flexible Configuration**: Configurable tokenization modes and penalty settings
|
|
13
|
+
- **Metadata Support**: Complete dictionary schema and metadata management
|
|
14
|
+
- **Training & Export** (optional): Train custom morphological analysis models from corpus data
|
|
15
|
+
|
|
16
|
+
## Requirements
|
|
17
|
+
|
|
18
|
+
- Ruby >= 3.1
|
|
19
|
+
- Rust >= 1.85
|
|
20
|
+
|
|
21
|
+
## Dictionary
|
|
22
|
+
|
|
23
|
+
Pre-built dictionaries are available from [GitHub Releases](https://github.com/lindera/lindera/releases).
|
|
24
|
+
Download a dictionary archive (e.g. `lindera-ipadic-*.zip`) and extract it to a local path.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
cd lindera-ruby
|
|
30
|
+
bundle install
|
|
31
|
+
bundle exec rake compile
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
```ruby
|
|
37
|
+
require "lindera"
|
|
38
|
+
|
|
39
|
+
# Load dictionary from a local path (download from GitHub Releases)
|
|
40
|
+
dictionary = Lindera.load_dictionary("/path/to/ipadic")
|
|
41
|
+
|
|
42
|
+
# Create a tokenizer
|
|
43
|
+
tokenizer = Lindera::Tokenizer.new(dictionary, "normal", nil)
|
|
44
|
+
|
|
45
|
+
# Tokenize text
|
|
46
|
+
tokens = tokenizer.tokenize("関西国際空港")
|
|
47
|
+
tokens.each do |token|
|
|
48
|
+
puts "#{token.surface}: #{token.details&.join(', ')}"
|
|
49
|
+
end
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Using TokenizerBuilder
|
|
53
|
+
|
|
54
|
+
```ruby
|
|
55
|
+
require "lindera"
|
|
56
|
+
|
|
57
|
+
builder = Lindera::TokenizerBuilder.new
|
|
58
|
+
builder.set_mode("normal")
|
|
59
|
+
builder.set_dictionary("/path/to/ipadic")
|
|
60
|
+
|
|
61
|
+
# Add filters
|
|
62
|
+
builder.append_character_filter("unicode_normalize", { "kind" => "nfkc" })
|
|
63
|
+
builder.append_token_filter("lowercase", nil)
|
|
64
|
+
|
|
65
|
+
tokenizer = builder.build
|
|
66
|
+
tokens = tokenizer.tokenize("テスト")
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Test
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
bundle exec rake compile
|
|
73
|
+
bundle exec rake test
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## License
|
|
77
|
+
|
|
78
|
+
MIT
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
require "rb_sys/mkmf"
|
|
2
|
+
|
|
3
|
+
create_rust_makefile("lindera/lindera_ruby") do |r|
|
|
4
|
+
# Cargo features are taken from the LINDERA_FEATURES environment variable: a comma-separated list whose entries are whitespace-stripped, with empty entries dropped.
|
|
5
|
+
# e.g., LINDERA_FEATURES="embed-ipadic" bundle exec rake compile (when the variable is unset or yields no entries, r.features is not assigned)
|
|
6
|
+
features = ENV.fetch("LINDERA_FEATURES", "").split(",").map(&:strip).reject(&:empty?)
|
|
7
|
+
r.features = features unless features.empty?
|
|
8
|
+
end
|
data/lib/lindera.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: lindera
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 3.0.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Lindera contributors
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: rb_sys
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '0.9'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '0.9'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: minitest
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '5.0'
|
|
33
|
+
type: :development
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '5.0'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: rake
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '13.0'
|
|
47
|
+
type: :development
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '13.0'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: rake-compiler
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - "~>"
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '1.2'
|
|
61
|
+
type: :development
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - "~>"
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '1.2'
|
|
68
|
+
description: Ruby bindings for Lindera, a morphological analysis library for CJK text
|
|
69
|
+
(Japanese, Korean, Chinese).
|
|
70
|
+
executables: []
|
|
71
|
+
extensions:
|
|
72
|
+
- ext/lindera_ruby/extconf.rb
|
|
73
|
+
extra_rdoc_files: []
|
|
74
|
+
files:
|
|
75
|
+
- Cargo.toml
|
|
76
|
+
- README.md
|
|
77
|
+
- ext/lindera_ruby/extconf.rb
|
|
78
|
+
- lib/lindera.rb
|
|
79
|
+
homepage: https://github.com/lindera/lindera
|
|
80
|
+
licenses:
|
|
81
|
+
- MIT
|
|
82
|
+
metadata: {}
|
|
83
|
+
rdoc_options: []
|
|
84
|
+
require_paths:
|
|
85
|
+
- lib
|
|
86
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
87
|
+
requirements:
|
|
88
|
+
- - ">="
|
|
89
|
+
- !ruby/object:Gem::Version
|
|
90
|
+
version: '3.1'
|
|
91
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - ">="
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
version: '0'
|
|
96
|
+
requirements: []
|
|
97
|
+
rubygems_version: 4.0.6
|
|
98
|
+
specification_version: 4
|
|
99
|
+
summary: Ruby bindings for Lindera morphological analysis engine
|
|
100
|
+
test_files: []
|