comparateur 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 29c929d2e8f2a89a4a4a36bcd412819a0feaf171
4
- data.tar.gz: 4287a132f615bb35cff1c4e49da0697387aae91d
3
+ metadata.gz: 3396baff4b80834299a15ffabeb73748ef16c6f5
4
+ data.tar.gz: 9d1fca09a2658a2c3ac0de11f29846040744d912
5
5
  SHA512:
6
- metadata.gz: ad03801f73f44f3e89d95d9591f569f7eb7a82f3e89f3512e6d88d00a3a8b3aaefdd1a4721a1514e583ea38e9300bb183225f766d9380d70541a93c638a14950
7
- data.tar.gz: 883479951d98d6409d6637e05141f8c078c42dc9f766c20a9408bfd80758c2c493ce37f941d754fbca607d0c0d80e52de6bc08b6ba2443efb1982d00d6d1d00d
6
+ metadata.gz: 13e9616ca619fd82cdbfe5f1625663a0da7fd9a72bf1a1cbf9f70ff18e481773b6963e3cdafa5c3399decca0aa4bacfae47f0decdc2af652ab45dbc77b3163df
7
+ data.tar.gz: df66d064b6d2446d47225c381c64a37e395196fedba86e0b6af0e4941074e5acca107369ba7a5e1a497c13b7594f65546c75153f8f7c894dc932ed04c5ef663c
data/README.md CHANGED
@@ -1,6 +1,17 @@
1
1
  # Comparateur
2
2
 
3
- Calculate the structural similarity between two HTML documents
3
+ [![Gem Version](https://badge.fury.io/rb/comparateur.svg)](http://badge.fury.io/rb/comparateur) [![comparateur Downloads](http://www.gemetric.me/images/comparateur.gif)](https://rubygems.org/gems/comparateur)
4
+
5
+ Calculate the structural similarity between two HTML documents.
6
+
7
+ **How it works**
8
+ It serializes strings, Nokogiri::HTML objects and URLs into arrays containing the nodes' tag names, and then finds the longest common subsequence (LCS) between the two serialized arrays.
9
+
10
+ The similarity is measured with the formula:
11
+ `2 * length(LCS Array) / (length(TreeA) + length(TreeB))`
12
+
13
+ **How it is done**
14
+ Classes are about objects and Modules are about functions. That's why you create your own class, `include` or `extend` `Comparateur` in it, and use it as you like. This implementation also lets you build your own cache system.
4
15
 
5
16
  ## Installation
6
17
 
@@ -21,18 +32,48 @@ Or install it yourself as:
21
32
  ```ruby
22
33
  require 'comparateur'
23
34
 
24
- exp = Le::Comparateur.new
35
+ class LeComparateur
36
+ extend Comparateur
37
+ end
25
38
 
26
- a = "<html><body></body></html>"
27
- b = "<html><body><h1></h1></body></html>"
39
+ google_url = "http://google.com"
40
+ duckduck_url = "https://duckduckgo.com"
28
41
 
29
- c = Nokogiri::HTML("<html><body></body></html>")
30
- d = a
31
-
32
- p exp.calculate_similarity(a, b) * 100 # in %
33
- p exp.calculate_similarity(c, d)
42
+ LeComparateur.compare_urls(google_url, duckduck_url) # 0.3815789473684211
34
43
  ```
35
44
 
45
+ Example of usage [here](https://raw.githubusercontent.com/radubogdan/ruby-comparateur/master/examples/a.rb)
46
+
47
+ ## Methods
48
+
49
+ `serialize_nokogiri_html(obj1)`
50
+ - `obj1`: Nokogiri::HTML object.
51
+ - `return`: Array containing node's tag names.
52
+
53
+ `serialize_url(url)`
54
+ - `url`: URL of the website.
55
+ - `return`: Array containing node's tag names.
56
+
57
+ `serialize_content(str)`
58
+ - `str`: String containing the html.
59
+ - `return`: Array containing node's tag names.
60
+
61
+ `compare_nokogiri_html(nok1, nok2)`
62
+ - `nok1, nok2`: Nokogiri::HTML objects.
63
+ - `return`: Score (0-1).
64
+
65
+ `compare_urls(url1, url2)`
66
+ - `url1, url2`: URL of two different websites.
67
+ - `return`: Score (0-1).
68
+
69
+ `compare_content(str1, str2)`
70
+ - `str1, str2`: First and second string which contain the html.
71
+ - `return`: Score (0-1).
72
+
73
+ `lcs(arr1, arr2)`
74
+ - `arr1, arr2`: First and second array which contain the node's tag names.
75
+ - `return`: Score (0-1).
76
+
36
77
  ## Contributing
37
78
 
38
79
  1. Fork it ( https://github.com/radubogdan/ruby-comparateur/fork )
data/examples/a.rb ADDED
@@ -0,0 +1,48 @@
1
+ load "./lib/comparateur.rb" # when the gem is installed, use `require "comparateur"` instead
2
+
3
+ # Create your own class and add
4
+ # Comparateur's functions as class methods
5
+ class LeComparateur
6
+ extend Comparateur
7
+ end
8
+
9
+ # Create your class and include all
10
+ # functions as instance methods
11
+ class MyClass
12
+ include Comparateur
13
+ end
14
+
15
+ # All examples use these
16
+ a = "<html><body><ul><li><ul><li></li></ul></li></ul></body></html"
17
+ b = "<html><body><ul><li></li></ul></body></html"
18
+ google_url = "http://google.com"
19
+ duckduck_url = "https://duckduckgo.com"
20
+ a_nok = Nokogiri::HTML(a)
21
+ b_nok = Nokogiri::HTML(b)
22
+ a_arr = %w(html body ul li ul li)
23
+ b_arr = %w(html body ul li)
24
+
25
+ # Serialize Nokogiri::HTML objects
26
+ # return: Array of HTML nodes
27
+ LeComparateur.serialize_nokogiri_html(a_nok) # ["html", "body", "ul", "li", "ul", "li"]
28
+
29
+ # Serialize URL
30
+ # return: Array of HTML nodes
31
+ LeComparateur.serialize_url(google_url) # Long array of nodes used in google.com
32
+
33
+ # Serialize Content
34
+ # return: Array of HTML nodes
35
+ LeComparateur.serialize_content(a) # ["html", "body", "ul", "li", "ul", "li"]
36
+ LeComparateur.serialize_content(b) # ["html", "body", "ul", "li"]
37
+
38
+ # Compare Nokogiri::HTML objects
39
+ LeComparateur.compare_nokogiri_html(a_nok, b_nok)
40
+
41
+ # Compare URLs
42
+ LeComparateur.compare_urls(google_url, duckduck_url) # 0.3815789473684211
43
+
44
+ # Compare content
45
+ LeComparateur.compare_content(a, b)
46
+
47
+ # Directly compare arrays of nodes
48
+ LeComparateur.lcs(a_arr, b_arr)
data/lib/comparateur.rb CHANGED
@@ -1,20 +1,43 @@
1
1
  require "comparateur/version"
2
2
  require "nokogiri"
3
3
  require "diff-lcs"
4
+ require "open-uri"
4
5
 
5
- module Le
6
- class Comparateur
6
+ module Comparateur
7
7
 
8
- def calculate_similarity site1, site2
9
- site1 = Nokogiri::HTML(site1) unless site1.is_a?(Nokogiri::HTML::Document)
10
- site2 = Nokogiri::HTML(site2) unless site2.is_a?(Nokogiri::HTML::Document)
8
+ def serialize_nokogiri_html nokogiri_html
9
+ nokogiri_html.search('*').map(&:name)
10
+ end
11
+
12
+ def serialize_url url
13
+ Nokogiri::HTML(open(url)).search('*').map(&:name)
14
+ end
15
+
16
+ def serialize_content str
17
+ Nokogiri::HTML(str).search('*').map(&:name)
18
+ end
11
19
 
12
- arr_site1 = site1.search('*').map(&:name)
13
- arr_site2 = site2.search('*').map(&:name)
14
- lcs = Diff::LCS.LCS(arr_site1, arr_site2)
20
+ def compare_nokogiri_html nok1, nok2
21
+ s1 = serialize_nokogiri_html(nok1)
22
+ s2 = serialize_nokogiri_html(nok2)
23
+ lcs(s1, s2)
24
+ end
15
25
 
16
- return (2.0 * lcs.length.to_f) / (arr_site1.length.to_f + arr_site2.length.to_f)
17
- end
26
+ def compare_urls url1, url2
27
+ s1 = serialize_url(url1)
28
+ s2 = serialize_url(url2)
29
+ lcs(s1, s2)
30
+ end
18
31
 
32
+ def compare_content str1, str2
33
+ s1 = serialize_content(str1)
34
+ s2 = serialize_content(str2)
35
+ lcs(s1, s2)
19
36
  end
37
+
38
+ def lcs arr1, arr2
39
+ lcs = Diff::LCS.LCS(arr1, arr2)
40
+ return (2.0 * lcs.length.to_f) / (arr1.length.to_f + arr2.length.to_f)
41
+ end
42
+
20
43
  end
@@ -1,3 +1,3 @@
1
1
  module Comparateur
2
- VERSION = "1.0.2"
2
+ VERSION = "2.0.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: comparateur
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Radu-Bogdan Croitoru
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-04 00:00:00.000000000 Z
11
+ date: 2014-12-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -78,8 +78,8 @@ files:
78
78
  - LICENSE.txt
79
79
  - README.md
80
80
  - Rakefile
81
- - a.rb
82
81
  - comparateur.gemspec
82
+ - examples/a.rb
83
83
  - lib/comparateur.rb
84
84
  - lib/comparateur/version.rb
85
85
  homepage: https://github.com/radubogdan/ruby-comparateur
data/a.rb DELETED
@@ -1,11 +0,0 @@
1
- load "lib/comparateur.rb"
2
-
3
- exp = Le::Comparateur.new
4
-
5
- #site1 = Nokogiri::HTML("<html><body><h1></h1><h2></h2></body></html>")
6
-
7
-
8
- site1 = ["html", "body"]
9
- site2 = Nokogiri::HTML("<html><body></body></html>")
10
-
11
- p exp.calculate_similarity(site1, site2)