license_matcher 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6b710a76802bc5254b2d0d9dd3c36376c613b7e5
4
- data.tar.gz: fef8fe16e4028ed1c49710956dceb4ac13acee9b
3
+ metadata.gz: 75f4cc4338e87a4244875d5172770867b63588cb
4
+ data.tar.gz: 3bb514e6e029140949ed3c78c1b32a1a04333a04
5
5
  SHA512:
6
- metadata.gz: ee54a1ae1b3258f9bc474a5a05c7221281be666976f6e1d547bb1673c4fa57507d0006a21b236d9ce02a55fa6e9e1ac9fe646a6c96d53c2d7255ab59d5e2c821
7
- data.tar.gz: adb94097dea0e79e19ac4b2cf9285d42e190902f2d8f798e5eb7239d094e20ca12193efb7342ad9ee4065b2e5f2bc48ccf3cacc318419699d7a7f2b528fb6fab
6
+ metadata.gz: 1b99befeaf51c94e2f85acdc991abd02486de41ea3f7506a2e7a6402f08e4c0765b322cb480db98703da01ef286bf9751bc6fc458356e1c93c4142f611a26457
7
+ data.tar.gz: 67b3d8e0265f91ebd9d1818983dfea906f543dffcc08d41d3cb50aeb66d56fe76714409085e741513a36229cbaefef388e0ffe691e20e06ec558796eef6f4ac0
data/Cargo.lock CHANGED
@@ -1,8 +1,8 @@
1
1
  [root]
2
2
  name = "license_matcher"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  dependencies = [
5
- "fosslim 0.0.2-alpha (registry+https://github.com/rust-lang/crates.io-index)",
5
+ "fosslim 0.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
6
6
  "helix 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
7
7
  ]
8
8
 
@@ -28,10 +28,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
28
28
 
29
29
  [[package]]
30
30
  name = "fosslim"
31
- version = "0.0.2-alpha"
31
+ version = "0.0.2"
32
32
  source = "registry+https://github.com/rust-lang/crates.io-index"
33
33
  dependencies = [
34
34
  "rmp-serde 0.13.7 (registry+https://github.com/rust-lang/crates.io-index)",
35
+ "seahash 3.0.5 (registry+https://github.com/rust-lang/crates.io-index)",
35
36
  "serde 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
36
37
  "serde_derive 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
37
38
  "serde_json 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -95,6 +96,11 @@ dependencies = [
95
96
  "serde 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)",
96
97
  ]
97
98
 
99
+ [[package]]
100
+ name = "seahash"
101
+ version = "3.0.5"
102
+ source = "registry+https://github.com/rust-lang/crates.io-index"
103
+
98
104
  [[package]]
99
105
  name = "serde"
100
106
  version = "1.0.14"
@@ -158,7 +164,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
158
164
  "checksum cslice 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0f8cb7306107e4b10e64994de6d3274bd08996a7c1322a27b86482392f96be0a"
159
165
  "checksum cstr-macro 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "db53fddba18cdd35477a7213a3ef6acfbfa333c31b42ce019e544c4a1420a06f"
160
166
  "checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab"
161
- "checksum fosslim 0.0.2-alpha (registry+https://github.com/rust-lang/crates.io-index)" = "9c1c5f5bce13b2a8bec5cc64f1f44b8b114ca2e0b55f53feaa074634a708f40a"
167
+ "checksum fosslim 0.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3024a8d550d9125c111b24ea5f36f8804e9b3b4a15143b2fc1a86f663edeb749"
162
168
  "checksum helix 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "e131c0e8557e78f4d992b87e97cec90f8503c4b5f3af24e1183e4822736e8079"
163
169
  "checksum itoa 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ac17257442c2ed77dbc9fd555cf83c58b0c7f7d0e8f2ae08c0ac05c72842e1f6"
164
170
  "checksum libc 0.2.30 (registry+https://github.com/rust-lang/crates.io-index)" = "2370ca07ec338939e356443dac2296f581453c35fe1e3a3ed06023c49435f915"
@@ -167,6 +173,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
167
173
  "checksum quote 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e920b65c65f10b2ae65c831a81a073a89edd28c7cce89475bff467ab4167a"
168
174
  "checksum rmp 0.8.7 (registry+https://github.com/rust-lang/crates.io-index)" = "a3d45d7afc9b132b34a2479648863aa95c5c88e98b32285326a6ebadc80ec5c9"
169
175
  "checksum rmp-serde 0.13.7 (registry+https://github.com/rust-lang/crates.io-index)" = "011e1d58446e9fa3af7cdc1fb91295b10621d3ac4cb3a85cc86385ee9ca50cd3"
176
+ "checksum seahash 3.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e048636bed25842fcdc36e5ad1ec6295b72d4b5b8a4b759b64915a4ce2b9d09d"
170
177
  "checksum serde 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)" = "bcb6a7637a47663ee073391a139ed07851f27ed2532c2abc88c6bf27a16cdf34"
171
178
  "checksum serde_derive 1.0.14 (registry+https://github.com/rust-lang/crates.io-index)" = "812ff66056fd9a9a5b7c119714243b0862cf98340e7d4b5ee05a932c40d5ea6c"
172
179
  "checksum serde_derive_internals 0.16.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bd381f6d01a6616cdba8530492d453b7761b456ba974e98768a18cad2cd76f58"
data/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "license_matcher"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  authors = ["Timo Sulg <timgluz@gmail.com>"]
5
5
 
6
6
  [lib]
@@ -8,4 +8,4 @@ crate-type = ["rlib", "cdylib"]
8
8
 
9
9
  [dependencies]
10
10
  helix = "^0.6.0"
11
- fosslim = "0.0.2-alpha"
11
+ fosslim = "0.0.2"
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- license_matcher (0.1.0.pre.alpha)
4
+ license_matcher (0.2.0.pre.alpha)
5
5
  helix_runtime (~> 0.6.0)
6
6
  narray (~> 0.6.1.2)
7
7
  nokogiri (~> 1.8.0)
data/README.md CHANGED
@@ -38,14 +38,14 @@ require 'license_matcher'
38
38
  curl -O https://github.com/Fosslim/license_matcher/blob/master/data/index.msgpack
39
39
 
40
40
  # or build index from the SPDX data
41
- LicenseMatcher::TFRustMatcher.build_index( "data/licenses", "data/index.msgpack")
41
+ LicenseMatcher::IndexBuilder.build_index( "data/licenses", "data/index.msgpack")
42
42
 
43
43
  # match license text
44
44
  txt = File.read("fixtures/files/mit.txt");
45
45
 
46
46
  lm = LicenseMatcher::TFRubyMatcher.new("data/index.msgpack")
47
- lm.match_text(txt, 0.9)
48
-
47
+ m = lm.match_text(txt, 0.9)
48
+ p "spdx id: #{m.get_label()}, confidence: #{m.get_score()}"
49
49
 
50
50
  ```
51
51
 
@@ -91,29 +91,40 @@ txt = File.read "fixtures/files/mit.txt"
91
91
  lm2.match_text txt
92
92
  ```
93
93
 
94
+ * **FingerprintMatcher** - uses hashes of 5-word n-grams to build fingerprints of the license files;
95
+
96
+ ```
97
+ lm3 = File.read "fixtures/files/mit.txt"
98
+ lm3.match_text txt
99
+ ```
100
+
94
101
  ## Benchmarks
95
102
 
96
- * initialization, Ruby version 1times, Rust version 1000x
103
+ * initialization 1x
97
104
 
98
105
  ```
99
- user system total real
100
- TFRubyMatcher: 12.850000 0.180000 13.030000 ( 13.210955)
101
- TFRustMatcher: 26.260000 9.400000 35.660000 ( 38.264632)
106
+ user system total real
107
+ TFRubyMatcher: 12.970000 0.170000 13.140000 ( 13.361568)
108
+ TFRustMatcher: 0.030000 0.010000 0.040000 ( 0.033793)
109
+ FingerprintMatcher: 0.340000 0.010000 0.350000 ( 0.368786)
102
110
  ```
111
+
103
112
  * matching preprocessed short [MIT](https://raw.githubusercontent.com/Fosslim/license_matcher/master/data/spdx_licenses/plain/MIT) text 1000 times
104
113
 
105
114
  ```
106
- user system total real
107
- TFRubyMatcher:102.410000 12.180000 114.590000 (116.308119)
108
- TFRustMatcher: 7.170000 0.040000 7.210000 ( 7.266000)
115
+ user system total real
116
+ TFRubyMatcher:102.380000 6.730000 109.110000 (113.526434)
117
+ TFRustMatcher: 7.920000 0.100000 8.020000 ( 8.248314)
118
+ FingerprintMatcher: 4.750000 0.060000 4.810000 ( 5.187512)
109
119
  ```
110
120
 
111
121
  * matching preprocessed long [AGPL-3.0](https://raw.githubusercontent.com/Fosslim/license_matcher/master/data/spdx_licenses/plain/AGPL-3.0) text 1000 times
112
122
 
113
123
  ```
114
124
  user system total real
115
- TFRubyMatcher:242.450000 21.960000 264.410000 (276.417704)
116
- TFRustMatcher: 9.340000 0.070000 9.410000 ( 9.478597)
125
+ TFRubyMatcher:217.270000 9.770000 227.040000 (232.190339)
126
+ TFRustMatcher: 9.330000 0.120000 9.450000 ( 9.654545)
127
+ FingerprintMatcher: 23.650000 0.250000 23.900000 ( 24.311123)
117
128
  ```
118
129
 
119
130
  ## Development
@@ -23,6 +23,9 @@ module LicenseMatcher
23
23
  true
24
24
  end
25
25
 
26
+ # matches the given text against SPDX licenses and returns a Match object
27
+ # returns:
28
+ # match - Match {label: String, score: float}; an empty Array when the given text is empty
26
29
  def match_text(text, min_confidence = DEFAULT_MIN_CONFIDENCE, is_processed_text = false)
27
30
  return [] if text.to_s.empty?
28
31
 
@@ -42,9 +45,9 @@ module LicenseMatcher
42
45
  best_match = @model.documents[doc_id].id
43
46
 
44
47
  if best_score.to_f > min_confidence
45
- best_match
48
+ Match.new(best_match, best_score)
46
49
  else
47
- ""
50
+ Match.new("", 0.0)
48
51
  end
49
52
  end
50
53
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: license_matcher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Timo Sulg
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-09-19 00:00:00.000000000 Z
12
+ date: 2017-09-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: helix_runtime
@@ -148,7 +148,8 @@ files:
148
148
  - lib/license_matcher/url_matcher.rb
149
149
  - lib/tasks/helix_runtime.rake
150
150
  homepage: https://www.github.com/fosslim
151
- licenses: []
151
+ licenses:
152
+ - MIT
152
153
  metadata: {}
153
154
  post_install_message:
154
155
  rdoc_options: []