comparateur 1.0.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +50 -9
- data/examples/a.rb +48 -0
- data/lib/comparateur.rb +33 -10
- data/lib/comparateur/version.rb +1 -1
- metadata +3 -3
- data/a.rb +0 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3396baff4b80834299a15ffabeb73748ef16c6f5
|
4
|
+
data.tar.gz: 9d1fca09a2658a2c3ac0de11f29846040744d912
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 13e9616ca619fd82cdbfe5f1625663a0da7fd9a72bf1a1cbf9f70ff18e481773b6963e3cdafa5c3399decca0aa4bacfae47f0decdc2af652ab45dbc77b3163df
|
7
|
+
data.tar.gz: df66d064b6d2446d47225c381c64a37e395196fedba86e0b6af0e4941074e5acca107369ba7a5e1a497c13b7594f65546c75153f8f7c894dc932ed04c5ef663c
|
data/README.md
CHANGED
@@ -1,6 +1,17 @@
|
|
1
1
|
# Comparateur
|
2
2
|
|
3
|
-
|
3
|
+
[Gem Version](http://badge.fury.io/rb/comparateur) [RubyGems](https://rubygems.org/gems/comparateur)
|
4
|
+
|
5
|
+
Calculate the structural similarity between two HTML documents.
|
6
|
+
|
7
|
+
**How it works**
|
8
|
+
It serializes strings, Nokogiri::HTML objects and URLs to arrays containing the nodes' tag names and finds the longest common subsequence (LCS) between the two serialized arrays.
|
9
|
+
|
10
|
+
The similarity is measured with the formula:
|
11
|
+
`2 * length(LCS Array) / (length(TreeA) + length(TreeB))`
|
12
|
+
|
13
|
+
**How it's done**
|
14
|
+
Classes are about objects and Modules are about functions. That's why you have to create a class, `include` or `extend` it with `Comparateur`, and use it as you like. This implementation also lets you build your own cache system.
|
4
15
|
|
5
16
|
## Installation
|
6
17
|
|
@@ -21,18 +32,48 @@ Or install it yourself as:
|
|
21
32
|
```ruby
|
22
33
|
require 'comparateur'
|
23
34
|
|
24
|
-
|
35
|
+
class LeComparateur
|
36
|
+
extend Comparateur
|
37
|
+
end
|
25
38
|
|
26
|
-
|
27
|
-
|
39
|
+
google_url = "http://google.com"
|
40
|
+
duckduck_url = "https://duckduckgo.com"
|
28
41
|
|
29
|
-
|
30
|
-
d = a
|
31
|
-
|
32
|
-
p exp.calculate_similarity(a, b) * 100 # in %
|
33
|
-
p exp.calculate_similarity(c, d)
|
42
|
+
LeComparateur.compare_urls(google_url, duckduck_url) # 0.3815789473684211
|
34
43
|
```
|
35
44
|
|
45
|
+
Example of usage [here](https://raw.githubusercontent.com/radubogdan/ruby-comparateur/master/examples/a.rb)
|
46
|
+
|
47
|
+
## Methods
|
48
|
+
|
49
|
+
`serialize_nokogiri_html(obj1)`
|
50
|
+
- `obj1`: Nokogiri::HTML object.
|
51
|
+
- `return`: Array containing the nodes' tag names.
|
52
|
+
|
53
|
+
`serialize_url(url)`
|
54
|
+
- `url`: URL of the website.
|
55
|
+
- `return`: Array containing the nodes' tag names.
|
56
|
+
|
57
|
+
`serialize_content(str)`
|
58
|
+
- `str`: String containing the html.
|
59
|
+
- `return`: Array containing the nodes' tag names.
|
60
|
+
|
61
|
+
`compare_nokogiri_html(nok1, nok2)`
|
62
|
+
- `nok1, nok2`: Nokogiri::HTML objects.
|
63
|
+
- `return`: Score (0-1).
|
64
|
+
|
65
|
+
`compare_urls(url1, url2)`
|
66
|
+
- `url1, url2`: URL of two different websites.
|
67
|
+
- `return`: Score (0-1).
|
68
|
+
|
69
|
+
`compare_content(str1, str2)`
|
70
|
+
- `str1, str2`: First and second string which contain the html.
|
71
|
+
- `return`: Score (0-1).
|
72
|
+
|
73
|
+
`lcs(arr1, arr2)`
|
74
|
+
- `arr1, arr2`: First and second arrays, which contain the nodes' tag names.
|
75
|
+
- `return`: Score (0-1).
|
76
|
+
|
36
77
|
## Contributing
|
37
78
|
|
38
79
|
1. Fork it ( https://github.com/radubogdan/ruby-comparateur/fork )
|
data/examples/a.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
load "./lib/comparateur.rb" # use require comparateur
|
2
|
+
|
3
|
+
# Create your own class and add
|
4
|
+
# Comparateur's functions as class methods
|
5
|
+
class LeComparateur
|
6
|
+
extend Comparateur
|
7
|
+
end
|
8
|
+
|
9
|
+
# Create your class and include all
|
10
|
+
# functions as instance methods
|
11
|
+
class MyClass
|
12
|
+
include Comparateur
|
13
|
+
end
|
14
|
+
|
15
|
+
# All examples use these
|
16
|
+
a = "<html><body><ul><li><ul><li></li></ul></li></ul></body></html"
|
17
|
+
b = "<html><body><ul><li></li></ul></body></html"
|
18
|
+
google_url = "http://google.com"
|
19
|
+
duckduck_url = "https://duckduckgo.com"
|
20
|
+
a_nok = Nokogiri::HTML(a)
|
21
|
+
b_nok = Nokogiri::HTML(b)
|
22
|
+
a_arr = %w(html body ul li ul li)
|
23
|
+
b_arr = %w(html body ul li)
|
24
|
+
|
25
|
+
# Serialize Nokogiri::HTML objects
|
26
|
+
# return: Array of HTML nodes
|
27
|
+
LeComparateur.serialize_nokogiri_html(a_nok) # ["html", "body", "ul", "li", "ul", "li"]
|
28
|
+
|
29
|
+
# Serialize URL
|
30
|
+
# return: Array of HTML nodes
|
31
|
+
LeComparateur.serialize_url(google_url) # Long array of nodes used in google.com
|
32
|
+
|
33
|
+
# Serialize Content
|
34
|
+
# return: Array of HTML nodes
|
35
|
+
LeComparateur.serialize_content(a) # ["html", "body", "ul", "li", "ul", "li"]
|
36
|
+
LeComparateur.serialize_content(b) # ["html", "body", "ul", "li"]
|
37
|
+
|
38
|
+
# Compare Nokogiri::HTML objects
|
39
|
+
LeComparateur.compare_nokogiri_html(a_nok, b_nok)
|
40
|
+
|
41
|
+
# Compare URLs
|
42
|
+
LeComparateur.compare_urls(google_url, duckduck_url) # 0.3815789473684211
|
43
|
+
|
44
|
+
# Compare content
|
45
|
+
LeComparateur.compare_content(a, b)
|
46
|
+
|
47
|
+
# Directly compare arrays of nodes
|
48
|
+
LeComparateur.lcs(a_arr, b_arr)
|
data/lib/comparateur.rb
CHANGED
@@ -1,20 +1,43 @@
|
|
1
1
|
require "comparateur/version"
|
2
2
|
require "nokogiri"
|
3
3
|
require "diff-lcs"
|
4
|
+
require "open-uri"
|
4
5
|
|
5
|
-
module
|
6
|
-
class Comparateur
|
6
|
+
module Comparateur
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
8
|
+
def serialize_nokogiri_html nokogiri_html
|
9
|
+
nokogiri_html.search('*').map(&:name)
|
10
|
+
end
|
11
|
+
|
12
|
+
def serialize_url url
|
13
|
+
Nokogiri::HTML(open(url)).search('*').map(&:name)
|
14
|
+
end
|
15
|
+
|
16
|
+
def serialize_content str
|
17
|
+
Nokogiri::HTML(str).search('*').map(&:name)
|
18
|
+
end
|
11
19
|
|
12
|
-
|
13
|
-
|
14
|
-
|
20
|
+
def compare_nokogiri_html nok1, nok2
|
21
|
+
s1 = serialize_nokogiri_html(nok1)
|
22
|
+
s2 = serialize_nokogiri_html(nok2)
|
23
|
+
lcs(s1, s2)
|
24
|
+
end
|
15
25
|
|
16
|
-
|
17
|
-
|
26
|
+
def compare_urls url1, url2
|
27
|
+
s1 = serialize_url(url1)
|
28
|
+
s2 = serialize_url(url2)
|
29
|
+
lcs(s1, s2)
|
30
|
+
end
|
18
31
|
|
32
|
+
def compare_content str1, str2
|
33
|
+
s1 = serialize_content(str1)
|
34
|
+
s2 = serialize_content(str2)
|
35
|
+
lcs(s1, s2)
|
19
36
|
end
|
37
|
+
|
38
|
+
def lcs arr1, arr2
|
39
|
+
lcs = Diff::LCS.LCS(arr1, arr2)
|
40
|
+
return (2.0 * lcs.length.to_f) / (arr1.length.to_f + arr2.length.to_f)
|
41
|
+
end
|
42
|
+
|
20
43
|
end
|
data/lib/comparateur/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: comparateur
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Radu-Bogdan Croitoru
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-12-
|
11
|
+
date: 2014-12-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -78,8 +78,8 @@ files:
|
|
78
78
|
- LICENSE.txt
|
79
79
|
- README.md
|
80
80
|
- Rakefile
|
81
|
-
- a.rb
|
82
81
|
- comparateur.gemspec
|
82
|
+
- examples/a.rb
|
83
83
|
- lib/comparateur.rb
|
84
84
|
- lib/comparateur/version.rb
|
85
85
|
homepage: https://github.com/radubogdan/ruby-comparateur
|
data/a.rb
DELETED
@@ -1,11 +0,0 @@
|
|
1
|
-
load "lib/comparateur.rb"
|
2
|
-
|
3
|
-
exp = Le::Comparateur.new
|
4
|
-
|
5
|
-
#site1 = Nokogiri::HTML("<html><body><h1></h1><h2></h2></body></html>")
|
6
|
-
|
7
|
-
|
8
|
-
site1 = ["html", "body"]
|
9
|
-
site2 = Nokogiri::HTML("<html><body></body></html>")
|
10
|
-
|
11
|
-
p exp.calculate_similarity(site1, site2)
|