license_matcher 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6b710a76802bc5254b2d0d9dd3c36376c613b7e5
4
- data.tar.gz: fef8fe16e4028ed1c49710956dceb4ac13acee9b
3
+ metadata.gz: 75f4cc4338e87a4244875d5172770867b63588cb
4
+ data.tar.gz: 3bb514e6e029140949ed3c78c1b32a1a04333a04
5
5
  SHA512:
6
- metadata.gz: ee54a1ae1b3258f9bc474a5a05c7221281be666976f6e1d547bb1673c4fa57507d0006a21b236d9ce02a55fa6e9e1ac9fe646a6c96d53c2d7255ab59d5e2c821
7
- data.tar.gz: adb94097dea0e79e19ac4b2cf9285d42e190902f2d8f798e5eb7239d094e20ca12193efb7342ad9ee4065b2e5f2bc48ccf3cacc318419699d7a7f2b528fb6fab
6
+ metadata.gz: 1b99befeaf51c94e2f85acdc991abd02486de41ea3f7506a2e7a6402f08e4c0765b322cb480db98703da01ef286bf9751bc6fc458356e1c93c4142f611a26457
7
+ data.tar.gz: 67b3d8e0265f91ebd9d1818983dfea906f543dffcc08d41d3cb50aeb66d56fe76714409085e741513a36229cbaefef388e0ffe691e20e06ec558796eef6f4ac0
data/Cargo.lock CHANGED
@@ -1,8 +1,8 @@
1
1
  [root]
2
2
  name = "license_matcher"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  dependencies = [
5
- "fosslim 0.0.2-alpha (registry+https://github.com/rust-lang/crates.io-index)",
5
+ "fosslim 0.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
6
6
  "helix 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
7
7
  ]
8
8
 
@@ -28,10 +28,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
28
28
 
29
29
  [[package]]
30
30
  name = "fosslim"
31
- version = "0.0.2-alpha"
31
+ version = "0.0.2"
32
32
  source = "registry+https://github.com/rust-lang/crates.io-index"
33
33
  dependencies = [
34
34
  "rmp-serde 0.13.7 (registry+https://github.com/rust-lang/crates.io-index)",
35
+ "seahash 3.0.5 (registry+https://github.com/rust-lang/crates.io-index)",
35
36
  "serde 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
36
37
  "serde_derive 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
37
38
  "serde_json 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -95,6 +96,11 @@ dependencies = [
95
96
  "serde 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
96
97
  ]
97
98
 
99
+ [[package]]
100
+ name = "seahash"
101
+ version = "3.0.5"
102
+ source = "registry+https://github.com/rust-lang/crates.io-index"
103
+
98
104
  [[package]]
99
105
  name = "serde"
100
106
  version = "1.0.14"
@@ -158,7 +164,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
158
164
  "checksum cslice 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0f8cb7306107e4b10e64994de6d3274bd08996a7c1322a27b86482392f96be0a"
159
165
  "checksum cstr-macro 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "db53fddba18cdd35477a7213a3ef6acfbfa333c31b42ce019e544c4a1420a06f"
160
166
  "checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab"
161
- "checksum fosslim 0.0.2-alpha (registry+https://github.com/rust-lang/crates.io-index)" = "9c1c5f5bce13b2a8bec5cc64f1f44b8b114ca2e0b55f53feaa074634a708f40a"
167
+ "checksum fosslim 0.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3024a8d550d9125c111b24ea5f36f8804e9b3b4a15143b2fc1a86f663edeb749"
162
168
  "checksum helix 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "e131c0e8557e78f4d992b87e97cec90f8503c4b5f3af24e1183e4822736e8079"
163
169
  "checksum itoa 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ac17257442c2ed77dbc9fd555cf83c58b0c7f7d0e8f2ae08c0ac05c72842e1f6"
164
170
  "checksum libc 0.2.30 (registry+https://github.com/rust-lang/crates.io-index)" = "2370ca07ec338939e356443dac2296f581453c35fe1e3a3ed06023c49435f915"
@@ -167,6 +173,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
167
173
  "checksum quote 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e920b65c65f10b2ae65c831a81a073a89edd28c7cce89475bff467ab4167a"
168
174
  "checksum rmp 0.8.7 (registry+https://github.com/rust-lang/crates.io-index)" = "a3d45d7afc9b132b34a2479648863aa95c5c88e98b32285326a6ebadc80ec5c9"
169
175
  "checksum rmp-serde 0.13.7 (registry+https://github.com/rust-lang/crates.io-index)" = "011e1d58446e9fa3af7cdc1fb91295b10621d3ac4cb3a85cc86385ee9ca50cd3"
176
+ "checksum seahash 3.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e048636bed25842fcdc36e5ad1ec6295b72d4b5b8a4b759b64915a4ce2b9d09d"
170
177
  "checksum serde 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)" = "bcb6a7637a47663ee073391a139ed07851f27ed2532c2abc88c6bf27a16cdf34"
171
178
  "checksum serde_derive 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)" = "812ff66056fd9a9a5b7c119714243b0862cf98340e7d4b5ee05a932c40d5ea6c"
172
179
  "checksum serde_derive_internals 0.16.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bd381f6d01a6616cdba8530492d453b7761b456ba974e98768a18cad2cd76f58"
data/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "license_matcher"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  authors = ["Timo Sulg <timgluz@gmail.com>"]
5
5
 
6
6
  [lib]
@@ -8,4 +8,4 @@ crate-type = ["rlib", "cdylib"]
8
8
 
9
9
  [dependencies]
10
10
  helix = "^0.6.0"
11
- fosslim = "0.0.2-alpha"
11
+ fosslim = "0.0.2"
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- license_matcher (0.1.0.pre.alpha)
4
+ license_matcher (0.2.0.pre.alpha)
5
5
  helix_runtime (~> 0.6.0)
6
6
  narray (~> 0.6.1.2)
7
7
  nokogiri (~> 1.8.0)
data/README.md CHANGED
@@ -38,14 +38,14 @@ require 'license_matcher'
38
38
  curl -O https://github.com/Fosslim/license_matcher/blob/master/data/index.msgpack
39
39
 
40
40
  # or build index from the SPDX data
41
- LicenseMatcher::TFRustMatcher.build_index( "data/licenses", "data/index.msgpack")
41
+ LicenseMatcher::IndexBuilder.build_index( "data/licenses", "data/index.msgpack")
42
42
 
43
43
  # match license text
44
44
  txt = File.read("fixtures/files/mit.txt");
45
45
 
46
46
  lm = LicenseMatcher::TFRubyMatcher.new("data/index.msgpack")
47
- lm.match_text(txt, 0.9)
48
-
47
+ m = lm.match_text(txt, 0.9)
48
+ p "spdx id: #{m.get_label()}, confidence: #{m.get_score()}"
49
49
 
50
50
  ```
51
51
 
@@ -91,29 +91,40 @@ txt = File.read "fixtures/files/mit.txt"
91
91
  lm2.match_text txt
92
92
  ```
93
93
 
94
+ * **FingerprintMatcher** - uses hashes of 5-word n-grams to build fingerprints of the license files;
95
+
96
+ ```
97
+ lm3 = File.read "fixtures/files/mit.txt"
98
+ lm3.match_text txt
99
+ ```
100
+
94
101
  ## Benchmarks
95
102
 
96
- * initialization, Ruby version 1times, Rust version 1000x
103
+ * initialization 1x
97
104
 
98
105
  ```
99
- user system total real
100
- TFRubyMatcher: 12.850000 0.180000 13.030000 ( 13.210955)
101
- TFRustMatcher: 26.260000 9.400000 35.660000 ( 38.264632)
106
+ user system total real
107
+ TFRubyMatcher: 12.970000 0.170000 13.140000 ( 13.361568)
108
+ TFRustMatcher: 0.030000 0.010000 0.040000 ( 0.033793)
109
+ FingerprintMatcher: 0.340000 0.010000 0.350000 ( 0.368786)
102
110
  ```
111
+
103
112
  * matching preprocessed short [MIT](https://raw.githubusercontent.com/Fosslim/license_matcher/master/data/spdx_licenses/plain/MIT) text 1000 times
104
113
 
105
114
  ```
106
- user system total real
107
- TFRubyMatcher:102.410000 12.180000 114.590000 (116.308119)
108
- TFRustMatcher: 7.170000 0.040000 7.210000 ( 7.266000)
115
+ user system total real
116
+ TFRubyMatcher:102.380000 6.730000 109.110000 (113.526434)
117
+ TFRustMatcher: 7.920000 0.100000 8.020000 ( 8.248314)
118
+ FingerprintMatcher: 4.750000 0.060000 4.810000 ( 5.187512)
109
119
  ```
110
120
 
111
121
  * matching preprocessed long [AGPL-3.0](https://raw.githubusercontent.com/Fosslim/license_matcher/master/data/spdx_licenses/plain/AGPL-3.0) text 1000 times
112
122
 
113
123
  ```
114
124
  user system total real
115
- TFRubyMatcher:242.450000 21.960000 264.410000 (276.417704)
116
- TFRustMatcher: 9.340000 0.070000 9.410000 ( 9.478597)
125
+ TFRubyMatcher:217.270000 9.770000 227.040000 (232.190339)
126
+ TFRustMatcher: 9.330000 0.120000 9.450000 ( 9.654545)
127
+ FingerprintMatcher: 23.650000 0.250000 23.900000 ( 24.311123)
117
128
  ```
118
129
 
119
130
  ## Development
@@ -23,6 +23,9 @@ module LicenseMatcher
23
23
  true
24
24
  end
25
25
 
26
+ # Matches the given text against SPDX licenses and returns a Match object.
27
+ # returns:
28
+ # match - Match {label: String, score: float}; note: an empty Array is returned when text is empty
26
29
  def match_text(text, min_confidence = DEFAULT_MIN_CONFIDENCE, is_processed_text = false)
27
30
  return [] if text.to_s.empty?
28
31
 
@@ -42,9 +45,9 @@ module LicenseMatcher
42
45
  best_match = @model.documents[doc_id].id
43
46
 
44
47
  if best_score.to_f > min_confidence
45
- best_match
48
+ Match.new(best_match, best_score)
46
49
  else
47
- ""
50
+ Match.new("", 0.0)
48
51
  end
49
52
  end
50
53
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: license_matcher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Timo Sulg
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-09-19 00:00:00.000000000 Z
12
+ date: 2017-09-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: helix_runtime
@@ -148,7 +148,8 @@ files:
148
148
  - lib/license_matcher/url_matcher.rb
149
149
  - lib/tasks/helix_runtime.rake
150
150
  homepage: https://www.github.com/fosslim
151
- licenses: []
151
+ licenses:
152
+ - MIT
152
153
  metadata: {}
153
154
  post_install_message:
154
155
  rdoc_options: []