comparateur 1.0.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 29c929d2e8f2a89a4a4a36bcd412819a0feaf171
4
- data.tar.gz: 4287a132f615bb35cff1c4e49da0697387aae91d
3
+ metadata.gz: 3396baff4b80834299a15ffabeb73748ef16c6f5
4
+ data.tar.gz: 9d1fca09a2658a2c3ac0de11f29846040744d912
5
5
  SHA512:
6
- metadata.gz: ad03801f73f44f3e89d95d9591f569f7eb7a82f3e89f3512e6d88d00a3a8b3aaefdd1a4721a1514e583ea38e9300bb183225f766d9380d70541a93c638a14950
7
- data.tar.gz: 883479951d98d6409d6637e05141f8c078c42dc9f766c20a9408bfd80758c2c493ce37f941d754fbca607d0c0d80e52de6bc08b6ba2443efb1982d00d6d1d00d
6
+ metadata.gz: 13e9616ca619fd82cdbfe5f1625663a0da7fd9a72bf1a1cbf9f70ff18e481773b6963e3cdafa5c3399decca0aa4bacfae47f0decdc2af652ab45dbc77b3163df
7
+ data.tar.gz: df66d064b6d2446d47225c381c64a37e395196fedba86e0b6af0e4941074e5acca107369ba7a5e1a497c13b7594f65546c75153f8f7c894dc932ed04c5ef663c
data/README.md CHANGED
@@ -1,6 +1,17 @@
1
1
  # Comparateur
2
2
 
3
- Calculate the structural similarity between two HTML documents
3
+ [![Gem Version](https://badge.fury.io/rb/comparateur.svg)](http://badge.fury.io/rb/comparateur) [![comparateur Downloads](http://www.gemetric.me/images/comparateur.gif)](https://rubygems.org/gems/comparateur)
4
+
5
+ Calculate the structural similarity between two HTML documents.
6
+
7
+ **How it works**
8
+ It serializes strings, Nokogiri::HTML objects and URLs into arrays containing the nodes' tag names and finds the longest common subsequence (LCS) between the two serialized arrays.
9
+
10
+ The similarity is measured with the formula:
11
+ `2 * length(LCS Array) / (length(TreeA) + length(TreeB))`
12
+
13
+ **How it's done**
14
+ Classes are about objects and Modules are about functions. That's why you have to create a class and `include` or `extend` it with `Comparateur` and use it as you like. This implementation also lets you build your own cache system.
4
15
 
5
16
  ## Installation
6
17
 
@@ -21,18 +32,48 @@ Or install it yourself as:
21
32
  ```ruby
22
33
  require 'comparateur'
23
34
 
24
- exp = Le::Comparateur.new
35
+ class LeComparateur
36
+ extend Comparateur
37
+ end
25
38
 
26
- a = "<html><body></body></html>"
27
- b = "<html><body><h1></h1></body></html>"
39
+ google_url = "http://google.com"
40
+ duckduck_url = "https://duckduckgo.com"
28
41
 
29
- c = Nokogiri::HTML("<html><body></body></html>")
30
- d = a
31
-
32
- p exp.calculate_similarity(a, b) * 100 # in %
33
- p exp.calculate_similarity(c, d)
42
+ LeComparateur.compare_urls(google_url, duckduck_url) # 0.3815789473684211
34
43
  ```
35
44
 
45
+ Example of usage [here](https://raw.githubusercontent.com/radubogdan/ruby-comparateur/master/examples/a.rb)
46
+
47
+ ## Methods
48
+
49
+ `serialize_nokogiri_html(obj1)`
50
+ - `obj1`: Nokogiri::HTML object.
51
+ - `return`: Array containing node's tag names.
52
+
53
+ `serialize_url(url)`
54
+ - `url`: URL of the website.
55
+ - `return`: Array containing node's tag names.
56
+
57
+ `serialize_content(str)`
58
+ - `str`: String containing the html.
59
+ - `return`: Array containing node's tag names.
60
+
61
+ `compare_nokogiri_html(nok1, nok2)`
62
+ - `nok1, nok2`: Nokogiri::HTML objects.
63
+ - `return`: Score (0-1).
64
+
65
+ `compare_urls(url1, url2)`
66
+ - `url1, url2`: URL of two different websites.
67
+ - `return`: Score (0-1).
68
+
69
+ `compare_content(str1, str2)`
70
+ - `str1, str2`: First and second string which contain the html.
71
+ - `return`: Score (0-1).
72
+
73
+ `lcs(arr1, arr2)`
74
+ - `arr1, arr2`: First and second array which contain the node's tag names.
75
+ - `return`: Score (0-1).
76
+
36
77
  ## Contributing
37
78
 
38
79
  1. Fork it ( https://github.com/radubogdan/ruby-comparateur/fork )
data/examples/a.rb ADDED
@@ -0,0 +1,48 @@
1
+ load "./lib/comparateur.rb" # use require comparateur
2
+
3
+ # Create your own class and add
4
+ # Comparateur's functions as class methods
5
+ class LeComparateur
6
+ extend Comparateur
7
+ end
8
+
9
+ # Create your class and include all
10
+ # functions as instance methods
11
+ class MyClass
12
+ include Comparateur
13
+ end
14
+
15
+ # All examples use these
16
+ a = "<html><body><ul><li><ul><li></li></ul></li></ul></body></html"
17
+ b = "<html><body><ul><li></li></ul></body></html"
18
+ google_url = "http://google.com"
19
+ duckduck_url = "https://duckduckgo.com"
20
+ a_nok = Nokogiri::HTML(a)
21
+ b_nok = Nokogiri::HTML(b)
22
+ a_arr = %w(html body ul li ul li)
23
+ b_arr = %w(html body ul li)
24
+
25
+ # Serialize Nokogiri::HTML objects
26
+ # return: Array of HTML nodes
27
+ LeComparateur.serialize_nokogiri_html(a_nok) # ["html", "body", "ul", "li", "ul", "li"]
28
+
29
+ # Serialize URL
30
+ # return: Array of HTML nodes
31
+ LeComparateur.serialize_url(google_url) # Long array of nodes used in google.com
32
+
33
+ # Serialize Content
34
+ # return: Array of HTML nodes
35
+ LeComparateur.serialize_content(a) # ["html", "body", "ul", "li", "ul", "li"]
36
+ LeComparateur.serialize_content(b) # ["html", "body", "ul", "li"]
37
+
38
+ # Compare Nokogiri::HTML objects
39
+ LeComparateur.compare_nokogiri_html(a_nok, b_nok)
40
+
41
+ # Compare URLs
42
+ LeComparateur.compare_urls(google_url, duckduck_url) # 0.3815789473684211
43
+
44
+ # Compare content
45
+ LeComparateur.compare_content(a, b)
46
+
47
+ # Directly compare arrays of nodes
48
+ LeComparateur.lcs(a_arr, b_arr)
data/lib/comparateur.rb CHANGED
@@ -1,20 +1,43 @@
1
1
  require "comparateur/version"
2
2
  require "nokogiri"
3
3
  require "diff-lcs"
4
+ require "open-uri"
4
5
 
5
- module Le
6
- class Comparateur
6
+ module Comparateur
7
7
 
8
- def calculate_similarity site1, site2
9
- site1 = Nokogiri::HTML(site1) unless site1.is_a?(Nokogiri::HTML::Document)
10
- site2 = Nokogiri::HTML(site2) unless site2.is_a?(Nokogiri::HTML::Document)
8
+ def serialize_nokogiri_html nokogiri_html
9
+ nokogiri_html.search('*').map(&:name)
10
+ end
11
+
12
+ def serialize_url url
13
+ Nokogiri::HTML(open(url)).search('*').map(&:name)
14
+ end
15
+
16
+ def serialize_content str
17
+ Nokogiri::HTML(str).search('*').map(&:name)
18
+ end
11
19
 
12
- arr_site1 = site1.search('*').map(&:name)
13
- arr_site2 = site2.search('*').map(&:name)
14
- lcs = Diff::LCS.LCS(arr_site1, arr_site2)
20
+ def compare_nokogiri_html nok1, nok2
21
+ s1 = serialize_nokogiri_html(nok1)
22
+ s2 = serialize_nokogiri_html(nok2)
23
+ lcs(s1, s2)
24
+ end
15
25
 
16
- return (2.0 * lcs.length.to_f) / (arr_site1.length.to_f + arr_site2.length.to_f)
17
- end
26
+ def compare_urls url1, url2
27
+ s1 = serialize_url(url1)
28
+ s2 = serialize_url(url2)
29
+ lcs(s1, s2)
30
+ end
18
31
 
32
+ def compare_content str1, str2
33
+ s1 = serialize_content(str1)
34
+ s2 = serialize_content(str2)
35
+ lcs(s1, s2)
19
36
  end
37
+
38
+ def lcs arr1, arr2
39
+ lcs = Diff::LCS.LCS(arr1, arr2)
40
+ return (2.0 * lcs.length.to_f) / (arr1.length.to_f + arr2.length.to_f)
41
+ end
42
+
20
43
  end
@@ -1,3 +1,3 @@
1
1
  module Comparateur
2
- VERSION = "1.0.2"
2
+ VERSION = "2.0.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: comparateur
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Radu-Bogdan Croitoru
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-04 00:00:00.000000000 Z
11
+ date: 2014-12-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -78,8 +78,8 @@ files:
78
78
  - LICENSE.txt
79
79
  - README.md
80
80
  - Rakefile
81
- - a.rb
82
81
  - comparateur.gemspec
82
+ - examples/a.rb
83
83
  - lib/comparateur.rb
84
84
  - lib/comparateur/version.rb
85
85
  homepage: https://github.com/radubogdan/ruby-comparateur
data/a.rb DELETED
@@ -1,11 +0,0 @@
1
- load "lib/comparateur.rb"
2
-
3
- exp = Le::Comparateur.new
4
-
5
- #site1 = Nokogiri::HTML("<html><body><h1></h1><h2></h2></body></html>")
6
-
7
-
8
- site1 = ["html", "body"]
9
- site2 = Nokogiri::HTML("<html><body></body></html>")
10
-
11
- p exp.calculate_similarity(site1, site2)