jules 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +60 -29
- data/lib/enumerable/sugar.rb +24 -0
- data/lib/jules.rb +160 -14
- data/lib/jules/version.rb +1 -1
- data/lib/nokogiri/sugar.rb +19 -19
- data/lib/simhash/sugar.rb +19 -0
- metadata +21 -40
- data/LICENSE +0 -22
- data/lib/damerau_levenshtein/sugar.rb +0 -9
- data/lib/jules/abstractions/list.rb +0 -10
- data/lib/jules/abstractions/title.rb +0 -21
- data/lib/jules/document.rb +0 -23
- data/lib/jules/miners/lists.rb +0 -114
- data/lib/jules/miners/titles.rb +0 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4324208ffa6000790da1143ead1d0fa1c460dc5
|
4
|
+
data.tar.gz: ccd6ca4f08cf1b3e726b734bfed6b835ce8fa24d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5cb30e73c092708515da863d4cbe399bf56b4f18192c506207c3dd922754cd805f1073b5ba7671289d847d9eac988a58caee43ce9df67c2b582c4de4776e9c55
|
7
|
+
data.tar.gz: 9d715e3dadd73bcc2636e25d740becbbfb081dbd1ab8a978ed0920cd3b74b8ca9fe368a41fd89cc399c8811bce33935db8c832823924ce958f211cdfddfd8275
|
data/README.md
CHANGED
@@ -1,40 +1,71 @@
|
|
1
|
-
|
1
|
+
# jules
|
2
|
+
> Experimental
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
Jules uses semantics, patterns and NLP to find data, so you don't have to specify exactly where it is. You'll no longer have to make different scrapers for every new website you want to scrape.
|
4
|
+
A fast new way to write scrapers.
|
5
|
+
This is still an ongoing project.
|
7
6
|
|
8
7
|
~~~ruby
|
9
|
-
|
8
|
+
require 'open-uri'
|
9
|
+
require 'jules'
|
10
|
+
source = URI.parse('https://news.ycombinator.com').read
|
11
|
+
|
12
|
+
filters = {
|
13
|
+
title: 'td.title a',
|
14
|
+
comments: [/(\d+) comments/, :optional],
|
15
|
+
points: /(\d+) points/
|
16
|
+
}
|
17
|
+
|
18
|
+
items = Jules.collect(source, filters)
|
19
|
+
# [{title: '2 years with Angular', comments: '95', points: '245'},
|
20
|
+
# {title: 'PolarSSL is now a part of ARM', comments: '13', points: '48'},
|
21
|
+
# {title: 'My boys love 1986 computing', comments: '25', points: '105'},
|
22
|
+
# {title: 'Kill init by touching a bunch of files', comments: '66', points: '102'},
|
23
|
+
# ...
|
10
24
|
~~~
|
11
25
|
|
12
|
-
##
|
13
|
-
|
14
|
-
|
26
|
+
## How?
|
27
|
+
|
28
|
+
Jules relies on the repetition of HTML structure. It rearranges the document using locality-sensitive hashing (Simhash), and then extracts data using the user-specified filters.
|
29
|
+
|
30
|
+
## Filters
|
31
|
+
|
32
|
+
Filters can be
|
33
|
+
- Strings (CSS selector / XPath query)
|
34
|
+
- Regexp
|
35
|
+
- Anonymous methods (`lambda`)
|
36
|
+
|
37
|
+
By default filters are required fields. If a field is optional, mark it with `['#example', :optional]`.
|
38
|
+
|
39
|
+
## Options
|
40
|
+
|
41
|
+
### Enabled HTML elements
|
42
|
+
By default, `div`, `li`, `tr` and `article` are the HTML elements searched for repetition. If a website wraps every item in `ul` and `div` elements, do this:
|
43
|
+
|
15
44
|
~~~ruby
|
16
|
-
html
|
17
|
-
j = Jules::HTML(html)
|
18
|
-
lists = j.lists
|
45
|
+
Jules.collect(html, filters, ['ul', 'div'])
|
19
46
|
~~~
|
20
47
|
|
21
|
-
|
48
|
+
### Examples
|
49
|
+
#### The Onion
|
22
50
|
~~~ruby
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
51
|
+
require 'open-uri'
|
52
|
+
require 'jules'
|
53
|
+
source = URI.parse('http://www.theonion.com/search/?q=why').read
|
54
|
+
|
55
|
+
filters = {
|
56
|
+
title: 'h1',
|
57
|
+
pubdate: /(\d{2}\.\d{2}\.\d{2})/,
|
58
|
+
img: 'img'
|
59
|
+
}
|
60
|
+
|
61
|
+
items = Jules.collect(source, filters)
|
62
|
+
# [{title: 'Why Are We Leaving Facebook?', pubdate: '10.10.13', img: 'http://o.onionstatic.com/images/23/23823/16x9/350.jpg?0553'},
|
63
|
+
# {title: 'Why Are We Filing For Disability?', pubdate: '01.24.14', img: 'http://o.onionstatic.com/images/25/25070/16x9/350.jpg?8738'},
|
64
|
+
# {title: 'Why Are We Waiting To Have Children?', pubdate: '07.10.14', img: 'http://o.onionstatic.com/images/26/26746/16x9/350.jpg?7206'},
|
65
|
+
# {title: 'Why Are We Postponing The Wedding?', pubdate: '04.25.13', img: '/images/21/21801/16x9/350.jpg?8189'},
|
66
|
+
# {title: 'Why Are We Canceling Our Netflix Account?', pubdate: '01.09.14', img: 'http://o.onionstatic.com/images/24/24668/16x9/350.jpg?3803'},
|
67
|
+
# {title: 'Why Aren't We Watching The Olympics?', pubdate: '02.20.14', img: 'http://o.onionstatic.com/images/25/25345/16x9/350.jpg?2178'},
|
68
|
+
# ...
|
28
69
|
~~~
|
29
70
|
|
30
|
-
|
31
|
-
- Lists
|
32
|
-
- Titles
|
33
|
-
- Menus
|
34
|
-
|
35
|
-
### Jules Data Types
|
36
|
-
- Date *:date*
|
37
|
-
- Price *:price*
|
38
|
-
- Filesize *:filesize*
|
39
|
-
- Download url *:download_url*
|
40
|
-
- Telephone number *:telephone_number*
|
71
|
+
See the [tests](test/) folder for more examples.
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Enumerable
  # Walks the collection, yielding each element together with the element
  # that came before it. The first element is yielded with +nil+ as its
  # predecessor.
  #
  # @yield [previous, current] the preceding element (nil on the first
  #   iteration) and the current element
  # @return [self] when a block is given
  # @return [Enumerator] when no block is given
  def each_with_previous
    return enum_for(:each_with_previous) unless block_given?

    # The memo carried by inject is simply "the element we just visited".
    inject(nil) do |previous, current|
      yield previous, current
      current
    end
    self
  end
end
|
7
|
+
|
8
|
+
class Array
  # Selects the elements that contain every key/value pair of +hash+.
  # Elements are expected to respond to +includes_hash?+ (see Hash below).
  #
  # @param hash [Hash] the pairs every selected element must contain
  # @return [Array] matching elements, in their original order
  def find_by_partial_hash(hash)
    select { |candidate| candidate.includes_hash?(hash) }
  end
end

class Hash
  # Tells whether every key/value pair of +other+ is also present in this
  # hash. Lookup goes through +self[key]+, so a pair whose value is +nil+
  # also matches a key that is absent from the receiver.
  #
  # @param other [Hash] the pairs to look for
  # @return [Boolean] true when all pairs match (always true for an empty
  #   +other+)
  def includes_hash?(other)
    other.all? { |key, value| self[key] == value }
  end
end
|
data/lib/jules.rb
CHANGED
@@ -1,25 +1,171 @@
|
|
1
1
|
require 'jules/version'
|
2
|
-
|
3
2
|
require 'nokogiri'
|
4
|
-
require '
|
5
|
-
require '
|
3
|
+
require 'simhash'
|
4
|
+
require 'descriptive_statistics'
|
6
5
|
|
7
6
|
require 'nokogiri/sugar'
|
8
|
-
require '
|
7
|
+
require 'enumerable/sugar'
|
8
|
+
require 'simhash/sugar'
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
# Jules finds repeated HTML structures in a document and extracts one item
# (a Hash of field => value) per repetition using caller-supplied filters.
module Jules
  # Minimum Simhash similarity for two neighbouring trees to share a cluster.
  SIMILARITY_THRESHOLD = 0.6
  # Bit length requested for every structure simhash.
  SIMHASH_BITLENGTH = 128
  # Element names searched for repeated structure by default.
  ELEMENTS = %w[div li tr article].freeze

  # Extracts items from an HTML document.
  #
  # @param html [String, File] the HTML source
  # @param filters [Hash] field name => filter (String selector, Regexp or
  #   Proc); a filter may also be given as an Array whose first element is
  #   the actual filter
  # @param elements [Array<String>] element names to consider as item roots
  # @return [Array<Hash>] the extracted items
  # @raise [ArgumentError] when an argument has the wrong type
  def self.collect(html, filters, elements=Jules::ELEMENTS)
    unless html.is_a?(String) || html.is_a?(File)
      raise ArgumentError, 'html not a String or File'
    end

    raise ArgumentError, 'filters argument empty' if filters.nil?
    raise ArgumentError, 'filters not a Hash' unless filters.is_a? Hash
    raise ArgumentError, 'elements not an Array' unless elements.is_a? Array

    document = Nokogiri::HTML(html)
    trees = Jules.rearrange_trees(document, elements)
    clusters = Jules.cluster_trees(trees)
    clusters = Jules.grade_clusters(clusters, filters)
    Jules.items(clusters, filters)
  end

  # Rearranges DOM trees with Simhash: collects every candidate element,
  # hashes its tag outline and returns the candidates sorted by that hash.
  #
  # Elements whose outline is empty are removed from the document; elements
  # larger than half of the document are skipped.
  #
  # @param document [Nokogiri::HTML::Document] parsed document
  # @param elements [Array<String>] element names to search for
  # @return [Array<Hash>] hashes with :node, :depth, :simhash and :index
  def rearrange_trees(document, elements=Jules::ELEMENTS)
    trees = []
    document_length = document.inner_html.length
    xpath = elements.map { |element| "//#{element}" }.join('|')

    document.search(xpath).each do |node|
      structure = Nokogiri::XML.remove_markup_outline(node.to_outline).strip
      if structure.empty?
        node.remove # This HTML tree does not contain any structure
        next
      end

      # Tree should be smaller than 50% of document size
      next if (node.inner_html.length / document_length.to_f * 100) > 50

      trees << {
        node: node,
        depth: node.depth,
        simhash: structure.simhash(hashbits: Jules::SIMHASH_BITLENGTH),
        # Position of the node among its element siblings.
        index: node.xpath('count(preceding-sibling::*)').to_i
      }
    end

    trees.sort_by { |tree| tree[:simhash] }
  end
  module_function :rearrange_trees

  # Clusters trees based on similarity. A new cluster is started whenever
  # the similarity between a tree and its predecessor drops below
  # SIMILARITY_THRESHOLD.
  #
  # @param trees [Array<Hash>] trees as returned by rearrange_trees
  # @return [Array<Array<Hash>>] clusters holding at least two trees
  def cluster_trees(trees)
    return [] if trees.empty?

    clusters = []
    cluster = [trees.first]

    trees.each_with_previous do |prev, tree|
      next if prev.nil? # first item is already in the initial cluster

      similarity = Simhash.similarity(prev[:simhash], tree[:simhash])
      if similarity < Jules::SIMILARITY_THRESHOLD
        clusters << cluster
        cluster = [tree]
      else
        cluster << tree
      end
    end
    clusters << cluster

    # Reject clusters that only contain 1 tree
    clusters.reject { |group| group.count < 2 }
  end
  module_function :cluster_trees

  # Grades clusters by running the filters over every tree.
  #
  # Each cluster becomes a Hash with :trees, :items, :score and
  # :score_ratio. The +clusters+ array is rewritten in place (map!) before
  # the graded result is returned.
  #
  # @param clusters [Array<Array<Hash>>] clusters from cluster_trees
  # @param filters [Hash] field name => filter
  # @return [Array<Hash>] clusters with at least one item, sorted by
  #   ascending :score
  def grade_clusters(clusters, filters)
    # Map trees inside cluster hash, to make room for metadata
    clusters.map! { |trees| { trees: trees } }

    clusters.each do |cluster|
      cluster[:score] = 0
      cluster[:trees].each do |tree|
        graded = filter_item(tree, filters)
        # One point per extracted field; trees without an item score 0.
        cluster[:score] += graded[:item].to_h.count
      end

      cluster[:items] = cluster[:trees]
        .sort_by { |tree| tree[:index] }
        .map { |tree| tree[:item] }
        .reject { |item| item.nil? }

      # Bonus points if all trees are at same depth
      depth_sd = cluster[:trees].map { |tree| tree[:depth] }.standard_deviation
      cluster[:score] += 1 if depth_sd < 0.5
      cluster[:score_ratio] = cluster[:score] / cluster[:trees].count.to_f
    end

    clusters
      .reject { |cluster| cluster[:items].count == 0 }
      .sort_by { |cluster| cluster[:score] }
  end
  module_function :grade_clusters

  # Tries to find a single item in a DOM tree by applying every filter to
  # the tree's node. Extracted values are stored under tree[:item], which
  # is created on first use.
  #
  # NOTE(review): when a filter is an Array only its first element is used;
  # any further entries (such as :optional) are currently ignored.
  #
  # @param tree [Hash] tree hash holding the node under :node
  # @param filters [Hash] field name => String, Regexp, Proc, or an Array
  #   starting with one of those
  # @return [Hash] the same +tree+, possibly with :item filled in
  # @raise [ArgumentError] when a filter has an unsupported type
  def filter_item(tree, filters)
    filters.each do |key, filter|
      filter = filter.first if filter.is_a?(Array) # TODO

      case filter
      when String
        value = tree[:node].at(filter)
        next unless value

        tree[:item] ||= {}
        # Elements without text fall back to their src/href attribute.
        tree[:item][key] =
          if value.text.empty?
            value['src'] || value['href']
          else
            value.text
          end
      when Regexp
        match = filter.match(tree[:node].inner_html)
        next unless match

        tree[:item] ||= {}
        tree[:item][key] = match.captures[0]
      when Proc
        # The item hash may not exist yet when a Proc is the first filter
        # to produce a value.
        tree[:item] ||= {}
        tree[:item][key] = filter.call(tree[:node])
      else
        raise ArgumentError, "#{filter} is not a valid filter type"
      end
    end
    tree
  end
  module_function :filter_item

  # Picks items from the best cluster, or combines items from the two best
  # clusters. Clusters scoring below one point per tree are removed from
  # +clusters+ in place.
  #
  # @param clusters [Array<Hash>] graded clusters from grade_clusters
  # @param filters [Hash] field name => filter (not used here)
  # @return [Array<Hash>] unique items, without partial duplicates
  def items(clusters, filters)
    # Clusters need to have at least one item field per tree
    clusters.select! { |cluster| cluster[:score_ratio] >= 1.0 }

    return [] if clusters.empty?

    # Pick two best groups
    best = clusters.sort_by { |cluster| cluster[:score_ratio] }.reverse[0, 2]

    # Find unique items, start with highest scoring clusters
    candidates = best.flat_map { |cluster| cluster[:items] }.uniq

    # Keep best items of partial duplicates: drop an item when another
    # candidate contains all of its pairs and has more fields.
    candidates.reject do |item|
      candidates.find_by_partial_hash(item).map(&:count).max > item.count
    end
  end
  module_function :items

  # Helper methods

  # Computes the simhash of +data+ using the module-wide bit length.
  def self.simhash(data); data.simhash(hashbits: Jules::SIMHASH_BITLENGTH); end
end
|
data/lib/jules/version.rb
CHANGED
data/lib/nokogiri/sugar.rb
CHANGED
@@ -1,33 +1,33 @@
|
|
1
1
|
# Add some sugar to Nokogiri
|
2
|
-
# http://stackoverflow.com/questions/7176094/how-do-i-create-an-outline-of-the-html-tag-structure-on-the-page-using-nokogiri
|
3
|
-
# http://stackoverflow.com/questions/5694759/how-do-you-calculate-the-number-of-levels-of-descendants-of-a-nokogiri-node
|
4
2
|
class Nokogiri::XML::Node
|
5
3
|
def to_outline
|
6
4
|
children.find_all(&:element?).map(&:to_outline).join
|
7
5
|
end
|
8
6
|
def depth
|
9
7
|
ancestors.size
|
10
|
-
# The following is ~10x slower: xpath('count(ancestor::node())').to_i
|
11
|
-
end
|
12
|
-
def leaves
|
13
|
-
xpath('.//*[not(*)]').to_a
|
14
|
-
end
|
15
|
-
def height
|
16
|
-
tallest = leaves.map{ |leaf| leaf.depth }.max
|
17
|
-
tallest ? tallest - depth : 0
|
18
|
-
end
|
19
|
-
def deepest_leaves
|
20
|
-
by_height = leaves.group_by{ |leaf| leaf.depth }
|
21
|
-
by_height[ by_height.keys.max ]
|
22
|
-
end
|
23
|
-
def deepest_level
|
24
|
-
by_height = leaves.group_by{ |leaf| leaf.depth }
|
25
|
-
by_height.keys.max
|
26
8
|
end
|
27
9
|
end
|
28
10
|
|
29
11
|
class Nokogiri::XML::Element
  # Renders this element as a bare tag outline: the tag name, its id and
  # class attributes when present, and the outlines of its child elements
  # (supplied by the Node implementation via super).
  def to_outline
    id_value = self['id']
    class_value = self['class']

    opening = "<#{name}"
    opening += " id=\"#{id_value}\"" if id_value
    opening += " class=\"#{class_value}\"" if class_value

    "#{opening}>#{super}</#{name}>"
  end
end
|
24
|
+
|
25
|
+
module Nokogiri::XML
  # Parses +str+ as an XML fragment, drops the inline formatting elements
  # (b, strong, strike, i, u) and returns the outline of what is left.
  def self.remove_markup_outline(str)
    fragment = Nokogiri::XML.fragment(str)
    %w[b strong strike i u].each do |tag|
      fragment.search(".//#{tag}").remove
    end
    fragment.to_outline
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Simhash
  # Counts the bit positions in which two integer hashes differ.
  #
  # @param hash1 [Integer]
  # @param hash2 [Integer]
  # @return [Integer] number of differing bits
  def self.hamming_distance(hash1, hash2)
    (hash1 ^ hash2).to_s(2).count('1')
  end

  # Similarity of two hashes as a fraction of Jules::SIMHASH_BITLENGTH:
  # 1.0 for identical hashes, lower as more bits differ.
  #
  # @return [Float]
  def self.similarity(hash1, hash2)
    1 - (Simhash.hamming_distance(hash1, hash2) / Jules::SIMHASH_BITLENGTH.to_f)
  end

  # Bitwise left rotate within Jules::SIMHASH_BITLENGTH bits.
  #
  # @param hash [Integer] value to rotate
  # @param n [Integer] number of positions; taken modulo the bit length so
  #   counts at or beyond the bit length wrap around
  # @return [Integer] rotated value, masked to the bit length
  def self.lotate(hash, n=1)
    bits = Jules::SIMHASH_BITLENGTH
    shift = n % bits
    mask = (1 << bits) - 1
    ((hash << shift) | (hash >> (bits - shift))) & mask
  end

  # Cluster
  # Placeholder: not implemented yet, always returns nil.
  def self.cluster(simhashes, threshold)
  end
end
|
metadata
CHANGED
@@ -1,89 +1,70 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jules
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bart Olsthoorn
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '1.6'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '1.6'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: simhash
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
33
|
+
version: '0.2'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
40
|
+
version: '0.2'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: descriptive_statistics
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '2.4'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
55
|
-
|
56
|
-
name: rspec
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ">="
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - ">="
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
69
|
-
description: High level data mining scraper using patterns, semantics and NLP.
|
54
|
+
version: '2.4'
|
55
|
+
description: Data mining scraper using local hashing.
|
70
56
|
email:
|
71
57
|
- bartolsthoorn@gmail.com
|
72
58
|
executables: []
|
73
59
|
extensions: []
|
74
60
|
extra_rdoc_files: []
|
75
61
|
files:
|
76
|
-
- LICENSE
|
77
62
|
- README.md
|
78
|
-
- lib/
|
63
|
+
- lib/enumerable/sugar.rb
|
79
64
|
- lib/jules.rb
|
80
|
-
- lib/jules/abstractions/list.rb
|
81
|
-
- lib/jules/abstractions/title.rb
|
82
|
-
- lib/jules/document.rb
|
83
|
-
- lib/jules/miners/lists.rb
|
84
|
-
- lib/jules/miners/titles.rb
|
85
65
|
- lib/jules/version.rb
|
86
66
|
- lib/nokogiri/sugar.rb
|
67
|
+
- lib/simhash/sugar.rb
|
87
68
|
homepage: http://github.com/bartolsthoorn/jules
|
88
69
|
licenses:
|
89
70
|
- MIT
|
@@ -104,8 +85,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
104
85
|
version: '0'
|
105
86
|
requirements: []
|
106
87
|
rubyforge_project:
|
107
|
-
rubygems_version: 2.
|
88
|
+
rubygems_version: 2.4.4
|
108
89
|
signing_key:
|
109
90
|
specification_version: 4
|
110
|
-
summary:
|
91
|
+
summary: Data mining scraper using local hashing.
|
111
92
|
test_files: []
|
data/LICENSE
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
Copyright (c) 2014 Bart Olsthoorn, website: bartolsthoorn.nl
|
2
|
-
|
3
|
-
Permission is hereby granted, free of charge, to any person
|
4
|
-
obtaining a copy of this software and associated documentation
|
5
|
-
files (the "Software"), to deal in the Software without
|
6
|
-
restriction, including without limitation the rights to use,
|
7
|
-
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
-
copies of the Software, and to permit persons to whom the
|
9
|
-
Software is furnished to do so, subject to the following
|
10
|
-
conditions:
|
11
|
-
|
12
|
-
The above copyright notice and this permission notice shall be
|
13
|
-
included in all copies or substantial portions of the Software.
|
14
|
-
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
-
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
19
|
-
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
20
|
-
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
21
|
-
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
22
|
-
OTHER DEALINGS IN THE SOFTWARE.
|
@@ -1,9 +0,0 @@
|
|
1
|
-
module DamerauLevenshtein
|
2
|
-
# returns 1.0 for completely different strings
|
3
|
-
# returns 0.0 for completely identical strings
|
4
|
-
def self.relative(a, b)
|
5
|
-
length = [a.length, b.length].max
|
6
|
-
return DamerauLevenshtein.distance(a, b).to_f / length
|
7
|
-
end
|
8
|
-
end
|
9
|
-
DL = DamerauLevenshtein
|
@@ -1,21 +0,0 @@
|
|
1
|
-
module Jules
|
2
|
-
module Abstractions
|
3
|
-
class Title
|
4
|
-
attr_accessor :level, :text, :language
|
5
|
-
|
6
|
-
def initialize(level, text)
|
7
|
-
raise ArgumentError if level.class != Fixnum
|
8
|
-
raise ArgumentError if text.class != String
|
9
|
-
|
10
|
-
# H1 means level 1, etc.
|
11
|
-
@level = level
|
12
|
-
|
13
|
-
# Name contains the actual title data
|
14
|
-
@text = text
|
15
|
-
|
16
|
-
# Language detection
|
17
|
-
@language = text.language
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
data/lib/jules/document.rb
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
module Jules
|
2
|
-
class << self
|
3
|
-
def HTML(html, options = {})
|
4
|
-
raise ArgumentError if html.class != String
|
5
|
-
Jules::Document.new html
|
6
|
-
end
|
7
|
-
end
|
8
|
-
|
9
|
-
class Document
|
10
|
-
attr_accessor :html
|
11
|
-
|
12
|
-
def initialize(html)
|
13
|
-
@html = Nokogiri::HTML::Document.parse html
|
14
|
-
end
|
15
|
-
|
16
|
-
def titles
|
17
|
-
Jules::Miners.titles @html
|
18
|
-
end
|
19
|
-
def lists
|
20
|
-
Jules::Miners.lists @html
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
data/lib/jules/miners/lists.rb
DELETED
@@ -1,114 +0,0 @@
|
|
1
|
-
module Jules
|
2
|
-
module Miners
|
3
|
-
class << self
|
4
|
-
def zebra_list?(list_items)
|
5
|
-
# Use outlines to see if lists are structured like
|
6
|
-
# AAAAAA (stride 0)
|
7
|
-
# ABABAB (stride 1)
|
8
|
-
# AABBAABB (stride 2)
|
9
|
-
# AAABBBAAABBB (stride 3)
|
10
|
-
|
11
|
-
outlines = list_items.map(&:to_outline)
|
12
|
-
|
13
|
-
# First test for non_zebra AAAAA
|
14
|
-
errors = []
|
15
|
-
outlines.each_with_index do |outline, i|
|
16
|
-
previous_outline = outlines[i - 1]
|
17
|
-
errors << DL.relative(outline, previous_outline)
|
18
|
-
end
|
19
|
-
avg_error = errors.inject(:+) / errors.size
|
20
|
-
stride0_certainty = 1 - avg_error
|
21
|
-
if stride0_certainty == 1.0
|
22
|
-
return {stride: 0, certainty: stride0_certainty }
|
23
|
-
end
|
24
|
-
|
25
|
-
# Not certain it's AAAA, so continue to check for ABABAB
|
26
|
-
errors = []
|
27
|
-
outlines.each_with_index do |outline, i|
|
28
|
-
before_outline = outlines[i - 2]
|
29
|
-
previous_outline = outlines[i - 1]
|
30
|
-
next_outline = outlines[i + 1]
|
31
|
-
|
32
|
-
if previous_outline && next_outline
|
33
|
-
zebra_1 = DL.relative(previous_outline, next_outline)
|
34
|
-
zebra_2 = DL.relative(outline, before_outline)
|
35
|
-
# zebra should be close to 0.0
|
36
|
-
errors << (zebra_1 + zebra_2) / 2
|
37
|
-
end
|
38
|
-
end
|
39
|
-
avg_error = errors.inject(:+) / errors.size
|
40
|
-
stride1_certainty = 1 - avg_error
|
41
|
-
if stride1_certainty > stride0_certainty
|
42
|
-
{stride: 1, certainty: stride1_certainty}
|
43
|
-
else
|
44
|
-
{stride: 0, certainty: stride0_certainty}
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def unzebrify(result)
|
49
|
-
result.each_with_index do |list, r|
|
50
|
-
nodes = list[:items].map{ |item| item[:node] }
|
51
|
-
zebra = zebra_list?(nodes)
|
52
|
-
|
53
|
-
if zebra[:stride] == 1
|
54
|
-
puts 'ZEBRA!'
|
55
|
-
# Merge nodes
|
56
|
-
#list[:items].each_with_index do |item, i|
|
57
|
-
# list[:items][i+1][:node] = [
|
58
|
-
# list[:items][i][:node],
|
59
|
-
# list[:items][i+1][:node]
|
60
|
-
# ]
|
61
|
-
# list[:items].delete_at(i)
|
62
|
-
#end
|
63
|
-
end
|
64
|
-
end
|
65
|
-
result
|
66
|
-
end
|
67
|
-
|
68
|
-
def lists(html)
|
69
|
-
depth = html.deepest_level
|
70
|
-
result = []
|
71
|
-
|
72
|
-
depth.times do |level|
|
73
|
-
xpath = '/*' * (level + 1)
|
74
|
-
nodes = html.xpath(xpath)
|
75
|
-
items = []
|
76
|
-
last_node = nodes.first
|
77
|
-
nodes.each do |node|
|
78
|
-
next unless [:li, :div, :tr].include? node.name.to_sym
|
79
|
-
|
80
|
-
if items.last && items.last[:node].name != node.name
|
81
|
-
# Store items as collection when 2 or more found
|
82
|
-
if items.count > 1
|
83
|
-
result << {
|
84
|
-
level: level,
|
85
|
-
items: items }
|
86
|
-
end
|
87
|
-
items = []
|
88
|
-
end
|
89
|
-
|
90
|
-
# Node is same element family as previous node
|
91
|
-
# But it could still be part of a one multiple node item
|
92
|
-
if items.last
|
93
|
-
# Is current node different from the previous one?
|
94
|
-
if items.last[:node].to_outline != node.to_outline
|
95
|
-
# Next node outline same as previous node outline?
|
96
|
-
end
|
97
|
-
end
|
98
|
-
items << {
|
99
|
-
titles: Jules::Miners.titles(node),
|
100
|
-
node: node,
|
101
|
-
text: node.text
|
102
|
-
}
|
103
|
-
end
|
104
|
-
if items.count > 1
|
105
|
-
result << {
|
106
|
-
level: level,
|
107
|
-
items: items }
|
108
|
-
end
|
109
|
-
end
|
110
|
-
unzebrify(result)
|
111
|
-
end
|
112
|
-
end
|
113
|
-
end
|
114
|
-
end
|
data/lib/jules/miners/titles.rb
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
module Jules
|
2
|
-
module Miners
|
3
|
-
class << self
|
4
|
-
def titles(html)
|
5
|
-
titles = []
|
6
|
-
|
7
|
-
10.times do |i|
|
8
|
-
level = i + 1
|
9
|
-
html.xpath('.//h' + level.to_s).each do |title|
|
10
|
-
name = title.text
|
11
|
-
titles << Jules::Abstractions::Title.new(level, name)
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
titles
|
16
|
-
end
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|