license_matcher 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +11 -4
- data/Cargo.toml +2 -2
- data/Gemfile.lock +1 -1
- data/README.md +23 -12
- data/lib/license_matcher/native.bundle +0 -0
- data/lib/license_matcher/tf_ruby_matcher.rb +5 -2
- metadata +4 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 75f4cc4338e87a4244875d5172770867b63588cb
|
|
4
|
+
data.tar.gz: 3bb514e6e029140949ed3c78c1b32a1a04333a04
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1b99befeaf51c94e2f85acdc991abd02486de41ea3f7506a2e7a6402f08e4c0765b322cb480db98703da01ef286bf9751bc6fc458356e1c93c4142f611a26457
|
|
7
|
+
data.tar.gz: 67b3d8e0265f91ebd9d1818983dfea906f543dffcc08d41d3cb50aeb66d56fe76714409085e741513a36229cbaefef388e0ffe691e20e06ec558796eef6f4ac0
|
data/Cargo.lock
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
[root]
|
|
2
2
|
name = "license_matcher"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
dependencies = [
|
|
5
|
-
"fosslim 0.0.2
|
|
5
|
+
"fosslim 0.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
6
6
|
"helix 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
7
7
|
]
|
|
8
8
|
|
|
@@ -28,10 +28,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
28
28
|
|
|
29
29
|
[[package]]
|
|
30
30
|
name = "fosslim"
|
|
31
|
-
version = "0.0.2
|
|
31
|
+
version = "0.0.2"
|
|
32
32
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
33
33
|
dependencies = [
|
|
34
34
|
"rmp-serde 0.13.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
35
|
+
"seahash 3.0.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
35
36
|
"serde 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
36
37
|
"serde_derive 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
37
38
|
"serde_json 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
@@ -95,6 +96,11 @@ dependencies = [
|
|
|
95
96
|
"serde 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
96
97
|
]
|
|
97
98
|
|
|
99
|
+
[[package]]
|
|
100
|
+
name = "seahash"
|
|
101
|
+
version = "3.0.5"
|
|
102
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
103
|
+
|
|
98
104
|
[[package]]
|
|
99
105
|
name = "serde"
|
|
100
106
|
version = "1.0.14"
|
|
@@ -158,7 +164,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
158
164
|
"checksum cslice 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0f8cb7306107e4b10e64994de6d3274bd08996a7c1322a27b86482392f96be0a"
|
|
159
165
|
"checksum cstr-macro 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "db53fddba18cdd35477a7213a3ef6acfbfa333c31b42ce019e544c4a1420a06f"
|
|
160
166
|
"checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab"
|
|
161
|
-
"checksum fosslim 0.0.2
|
|
167
|
+
"checksum fosslim 0.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3024a8d550d9125c111b24ea5f36f8804e9b3b4a15143b2fc1a86f663edeb749"
|
|
162
168
|
"checksum helix 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "e131c0e8557e78f4d992b87e97cec90f8503c4b5f3af24e1183e4822736e8079"
|
|
163
169
|
"checksum itoa 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ac17257442c2ed77dbc9fd555cf83c58b0c7f7d0e8f2ae08c0ac05c72842e1f6"
|
|
164
170
|
"checksum libc 0.2.30 (registry+https://github.com/rust-lang/crates.io-index)" = "2370ca07ec338939e356443dac2296f581453c35fe1e3a3ed06023c49435f915"
|
|
@@ -167,6 +173,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
167
173
|
"checksum quote 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e920b65c65f10b2ae65c831a81a073a89edd28c7cce89475bff467ab4167a"
|
|
168
174
|
"checksum rmp 0.8.7 (registry+https://github.com/rust-lang/crates.io-index)" = "a3d45d7afc9b132b34a2479648863aa95c5c88e98b32285326a6ebadc80ec5c9"
|
|
169
175
|
"checksum rmp-serde 0.13.7 (registry+https://github.com/rust-lang/crates.io-index)" = "011e1d58446e9fa3af7cdc1fb91295b10621d3ac4cb3a85cc86385ee9ca50cd3"
|
|
176
|
+
"checksum seahash 3.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e048636bed25842fcdc36e5ad1ec6295b72d4b5b8a4b759b64915a4ce2b9d09d"
|
|
170
177
|
"checksum serde 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)" = "bcb6a7637a47663ee073391a139ed07851f27ed2532c2abc88c6bf27a16cdf34"
|
|
171
178
|
"checksum serde_derive 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)" = "812ff66056fd9a9a5b7c119714243b0862cf98340e7d4b5ee05a932c40d5ea6c"
|
|
172
179
|
"checksum serde_derive_internals 0.16.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bd381f6d01a6616cdba8530492d453b7761b456ba974e98768a18cad2cd76f58"
|
data/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "license_matcher"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
authors = ["Timo Sulg <timgluz@gmail.com>"]
|
|
5
5
|
|
|
6
6
|
[lib]
|
|
@@ -8,4 +8,4 @@ crate-type = ["rlib", "cdylib"]
|
|
|
8
8
|
|
|
9
9
|
[dependencies]
|
|
10
10
|
helix = "^0.6.0"
|
|
11
|
-
fosslim = "0.0.2
|
|
11
|
+
fosslim = "0.0.2"
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
|
@@ -38,14 +38,14 @@ require 'license_matcher'
|
|
|
38
38
|
curl -O https://github.com/Fosslim/license_matcher/blob/master/data/index.msgpack
|
|
39
39
|
|
|
40
40
|
# or build index from the SPDX data
|
|
41
|
-
LicenseMatcher::
|
|
41
|
+
LicenseMatcher::IndexBuilder.build_index( "data/licenses", "data/index.msgpack")
|
|
42
42
|
|
|
43
43
|
# match license text
|
|
44
44
|
txt = File.read("fixtures/files/mit.txt");
|
|
45
45
|
|
|
46
46
|
lm = LicenseMatcher::TFRubyMatcher.new("data/index.msgpack")
|
|
47
|
-
lm.match_text(txt, 0.9)
|
|
48
|
-
|
|
47
|
+
m = lm.match_text(txt, 0.9)
|
|
48
|
+
p "spdx id: #{m.get_label()}, confidence: #{m.get_score()}"
|
|
49
49
|
|
|
50
50
|
```
|
|
51
51
|
|
|
@@ -91,29 +91,40 @@ txt = File.read "fixtures/files/mit.txt"
|
|
|
91
91
|
lm2.match_text txt
|
|
92
92
|
```
|
|
93
93
|
|
|
94
|
+
* **FingerprintMatcher** - uses hashes of 5-word n-grams to build fingerprints of the license files;
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
lm3 = File.read "fixtures/files/mit.txt"
|
|
98
|
+
lm3.match_text txt
|
|
99
|
+
```
|
|
100
|
+
|
|
94
101
|
## Benchmarks
|
|
95
102
|
|
|
96
|
-
* initialization
|
|
103
|
+
* initialization 1x
|
|
97
104
|
|
|
98
105
|
```
|
|
99
|
-
|
|
100
|
-
TFRubyMatcher:
|
|
101
|
-
TFRustMatcher:
|
|
106
|
+
user system total real
|
|
107
|
+
TFRubyMatcher: 12.970000 0.170000 13.140000 ( 13.361568)
|
|
108
|
+
TFRustMatcher: 0.030000 0.010000 0.040000 ( 0.033793)
|
|
109
|
+
FingerprintMatcher: 0.340000 0.010000 0.350000 ( 0.368786)
|
|
102
110
|
```
|
|
111
|
+
|
|
103
112
|
* matching preprocessed short [MIT](https://raw.githubusercontent.com/Fosslim/license_matcher/master/data/spdx_licenses/plain/MIT) text 1000 times
|
|
104
113
|
|
|
105
114
|
```
|
|
106
|
-
|
|
107
|
-
TFRubyMatcher:102.
|
|
108
|
-
TFRustMatcher: 7.
|
|
115
|
+
user system total real
|
|
116
|
+
TFRubyMatcher:102.380000 6.730000 109.110000 (113.526434)
|
|
117
|
+
TFRustMatcher: 7.920000 0.100000 8.020000 ( 8.248314)
|
|
118
|
+
FingerMatcher: 4.750000 0.060000 4.810000 ( 5.187512)
|
|
109
119
|
```
|
|
110
120
|
|
|
111
121
|
* matching preprocessed long [AGPL-3.0](https://raw.githubusercontent.com/Fosslim/license_matcher/master/data/spdx_licenses/plain/AGPL-3.0) text 1000 times
|
|
112
122
|
|
|
113
123
|
```
|
|
114
124
|
user system total real
|
|
115
|
-
TFRubyMatcher:
|
|
116
|
-
TFRustMatcher: 9.
|
|
125
|
+
TFRubyMatcher:217.270000 9.770000 227.040000 (232.190339)
|
|
126
|
+
TFRustMatcher: 9.330000 0.120000 9.450000 ( 9.654545)
|
|
127
|
+
FingerMatcher: 23.650000 0.250000 23.900000 ( 24.311123)
|
|
117
128
|
```
|
|
118
129
|
|
|
119
130
|
## Development
|
|
Binary file
|
|
@@ -23,6 +23,9 @@ module LicenseMatcher
|
|
|
23
23
|
true
|
|
24
24
|
end
|
|
25
25
|
|
|
26
|
+
# matches the given text against SPDX licenses and returns a Match object (empty text returns [])
|
|
27
|
+
# returns:
|
|
28
|
+
# match - Match {label: String, score: float}
|
|
26
29
|
def match_text(text, min_confidence = DEFAULT_MIN_CONFIDENCE, is_processed_text = false)
|
|
27
30
|
return [] if text.to_s.empty?
|
|
28
31
|
|
|
@@ -42,9 +45,9 @@ module LicenseMatcher
|
|
|
42
45
|
best_match = @model.documents[doc_id].id
|
|
43
46
|
|
|
44
47
|
if best_score.to_f > min_confidence
|
|
45
|
-
best_match
|
|
48
|
+
Match.new(best_match, best_score)
|
|
46
49
|
else
|
|
47
|
-
""
|
|
50
|
+
Match.new("", 0.0)
|
|
48
51
|
end
|
|
49
52
|
end
|
|
50
53
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: license_matcher
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Timo Sulg
|
|
@@ -9,7 +9,7 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2017-09-
|
|
12
|
+
date: 2017-09-27 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: helix_runtime
|
|
@@ -148,7 +148,8 @@ files:
|
|
|
148
148
|
- lib/license_matcher/url_matcher.rb
|
|
149
149
|
- lib/tasks/helix_runtime.rake
|
|
150
150
|
homepage: https://www.github.com/fosslim
|
|
151
|
-
licenses:
|
|
151
|
+
licenses:
|
|
152
|
+
- MIT
|
|
152
153
|
metadata: {}
|
|
153
154
|
post_install_message:
|
|
154
155
|
rdoc_options: []
|