jules 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +22 -0
- data/README.md +40 -0
- data/lib/damerau_levenshtein/sugar.rb +9 -0
- data/lib/jules.rb +25 -0
- data/lib/jules/abstractions/list.rb +10 -0
- data/lib/jules/abstractions/title.rb +21 -0
- data/lib/jules/document.rb +23 -0
- data/lib/jules/miners/lists.rb +114 -0
- data/lib/jules/miners/titles.rb +19 -0
- data/lib/jules/version.rb +3 -0
- data/lib/nokogiri/sugar.rb +33 -0
- metadata +111 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: f646200a27845170536bad15df78cd74f9a5fca4
|
4
|
+
data.tar.gz: c456c90b2e122bb9598323b751e6066671c21f1a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 5ab95e69f9e86dc1090bc9cf96be3439f00cd6af2a69225c100851ad54a53c830761e7b597832bd771d6b9820a4383145e56360266ac02a2f1518849bf7c314e
|
7
|
+
data.tar.gz: 041c7c4b5135becebb8d4e925a0f7035e27bf7ec3b9b806c8c43e3ddaa8aecc33a204841538471bc09bbb07004e1c2c6afdb8fb8f7b536eac4af3edfa476db82
|
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Bart Olsthoorn, website: bartolsthoorn.nl
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person
|
4
|
+
obtaining a copy of this software and associated documentation
|
5
|
+
files (the "Software"), to deal in the Software without
|
6
|
+
restriction, including without limitation the rights to use,
|
7
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
copies of the Software, and to permit persons to whom the
|
9
|
+
Software is furnished to do so, subject to the following
|
10
|
+
conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
19
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
20
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
21
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
22
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
**Hi!** - Don't use this gem _yet_, it's still very much being developed.
|
2
|
+
|
3
|
+
# Jules
|
4
|
+
A data mining scraper with a high level of abstraction. It's capable of finding _lists_, _menus_, _titles_ and _contact data_.
|
5
|
+
|
6
|
+
Jules uses semantics, patterns and NLP to find data, so you don't have to specify exactly where it is. You'll no longer have to make different scrapers for every new website you want to scrape.
|
7
|
+
|
8
|
+
~~~ruby
|
9
|
+
gem 'jules'
|
10
|
+
~~~
|
11
|
+
|
12
|
+
## Examples
|
13
|
+
The following examples show you how to use Jules.
|
14
|
+
### Lists
|
15
|
+
~~~ruby
|
16
|
+
html = File.open('web-page.html', 'rb') { |f| f.read }
|
17
|
+
j = Jules::HTML(html)
|
18
|
+
lists = j.lists
|
19
|
+
~~~
|
20
|
+
|
21
|
+
The following example gets lists only when they contain certain data types.
|
22
|
+
~~~ruby
|
23
|
+
j = Jules::HTML(html)
|
24
|
+
lists = j.lists(
|
25
|
+
required: [:date, :price],
|
26
|
+
optional: [:download_url]
|
27
|
+
)
|
28
|
+
~~~
|
29
|
+
|
30
|
+
### Jules Abstractions
|
31
|
+
- Lists
|
32
|
+
- Titles
|
33
|
+
- Menus
|
34
|
+
|
35
|
+
### Jules Data Types
|
36
|
+
- Date *:date*
|
37
|
+
- Price *:price*
|
38
|
+
- Filesize *:filesize*
|
39
|
+
- Download url *:download_url*
|
40
|
+
- Telephone number *:telephone_number*
|
@@ -0,0 +1,9 @@
|
|
1
|
+
module DamerauLevenshtein
  # Edit distance between two strings, scaled by the length of the longer one.
  #
  # returns 1.0 for completely different strings
  # returns 0.0 for completely identical strings
  #
  # @param a [String] first string
  # @param b [String] second string
  # @return [Float] DamerauLevenshtein.distance(a, b) divided by the longer length
  def self.relative(a, b)
    longest = [a.length, b.length].max

    # Two empty strings are identical. Without this guard the division below
    # would be 0.0 / 0, which is NaN and poisons every average built on it.
    return 0.0 if longest.zero?

    DamerauLevenshtein.distance(a, b).to_f / longest
  end
end
|
9
|
+
DL = DamerauLevenshtein
|
data/lib/jules.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'jules/version'
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'whatlanguage'
|
5
|
+
require 'damerau-levenshtein'
|
6
|
+
|
7
|
+
require 'nokogiri/sugar'
|
8
|
+
require 'damerau_levenshtein/sugar'
|
9
|
+
|
10
|
+
require 'jules/abstractions/list'
|
11
|
+
require 'jules/abstractions/title'
|
12
|
+
|
13
|
+
require 'jules/miners/titles'
|
14
|
+
require 'jules/miners/lists'
|
15
|
+
|
16
|
+
require 'jules/document'
|
17
|
+
|
18
|
+
module Jules
  # Display name of the library.
  NAME = 'Jules'
  # Short pointer to where the licence terms live.
  LICENSE = 'See LICENSE for licensing details.'

  class << self
    # Easter egg: writes a calming one-liner to standard output.
    def ❨╯°□°❩╯︵┻━┻
      puts 'Calm down, bro'
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Jules
  module Abstractions
    # A heading extracted from a document: its level, its text and the
    # language detected for that text.
    class Title
      attr_accessor :level, :text, :language

      # @param level [Integer] heading level (H1 means level 1, etc.)
      # @param text [String] the actual title text
      # @raise [ArgumentError] when level is not an Integer or text is not a String
      def initialize(level, text)
        # Integer rather than Fixnum: Fixnum was deprecated in Ruby 2.4 and
        # removed in Ruby 3.2, where referencing it raises NameError.
        raise ArgumentError, 'level must be an Integer' unless level.is_a?(Integer)
        raise ArgumentError, 'text must be a String' unless text.is_a?(String)

        @level = level
        @text = text

        # Language detection. String#language is not defined in this file;
        # presumably it comes from the whatlanguage gem -- confirm.
        @language = text.language
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Jules
  # Entry point: wraps an HTML string in a Jules::Document.
  #
  # @param html [String] raw HTML source; anything whose class is not exactly
  #   String is rejected
  # @param options [Hash] accepted for the call signature; not read here
  # @return [Jules::Document]
  # @raise [ArgumentError] when html is not a String
  def self.HTML(html, options = {})
    raise ArgumentError unless html.class == String

    Jules::Document.new(html)
  end

  # A parsed page plus shortcuts to the miners that operate on it.
  class Document
    attr_accessor :html

    # Parses the markup with Nokogiri and keeps the parsed document.
    def initialize(html)
      @html = Nokogiri::HTML::Document.parse(html)
    end

    # Titles mined from the parsed document.
    def titles
      Jules::Miners.titles(@html)
    end

    # Lists mined from the parsed document.
    def lists
      Jules::Miners.lists(@html)
    end
  end
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
module Jules
|
2
|
+
module Miners
|
3
|
+
class << self
|
4
|
+
# Estimates whether a run of list items is uniform or alternating, judged by
# the similarity of their tag outlines:
#
#   AAAAAA (stride 0)
#   ABABAB (stride 1)
#
# Only strides 0 and 1 are checked; larger strides (AABBAABB, ...) are not
# detected.
#
# @param list_items [Array<#to_outline>] items exposing a String outline
# @return [Hash] { stride: 0 or 1, certainty: Float }, where certainty is
#   1 minus the average relative edit distance of the compared outlines
def zebra_list?(list_items)
  outlines = list_items.map(&:to_outline)

  # Nothing to compare: report a trivially uniform list instead of crashing
  # on nil / 0 when averaging an empty error list.
  return { stride: 0, certainty: 1.0 } if outlines.empty?

  # First test for non-zebra AAAAA: every outline against its predecessor.
  # NOTE(review): for i == 0 the index -1 wraps around to the last outline,
  # so the first item is compared with the last one -- confirm this is wanted.
  stride0_errors = outlines.each_with_index.map do |outline, i|
    DL.relative(outline, outlines[i - 1])
  end
  stride0_certainty = 1 - (stride0_errors.inject(:+) / stride0_errors.size)
  return { stride: 0, certainty: stride0_certainty } if stride0_certainty == 1.0

  # Not certain it's AAAA, so continue to check for ABABAB: outlines two
  # positions apart should match. Negative indexes wrap around here as well.
  stride1_errors = []
  outlines.each_with_index do |outline, i|
    before_outline = outlines[i - 2]
    previous_outline = outlines[i - 1]
    next_outline = outlines[i + 1]
    next unless previous_outline && next_outline

    zebra_1 = DL.relative(previous_outline, next_outline)
    zebra_2 = DL.relative(outline, before_outline)
    # Both distances are close to 0.0 for a real zebra pattern.
    stride1_errors << (zebra_1 + zebra_2) / 2
  end

  # No stride-1 evidence could be gathered: stay with the stride-0 verdict
  # rather than dividing nil by zero.
  return { stride: 0, certainty: stride0_certainty } if stride1_errors.empty?

  stride1_certainty = 1 - (stride1_errors.inject(:+) / stride1_errors.size)
  if stride1_certainty > stride0_certainty
    { stride: 1, certainty: stride1_certainty }
  else
    { stride: 0, certainty: stride0_certainty }
  end
end
|
47
|
+
|
48
|
+
# Runs zebra detection over every mined list and announces the alternating
# (stride 1) ones on standard output. Merging the paired nodes is only
# sketched in comments, so the lists themselves are returned unchanged.
#
# @param result [Array<Hash>] lists whose :items entries each carry a :node
# @return [Array<Hash>] the same array that was passed in
def unzebrify(result)
  result.each do |list|
    item_nodes = list[:items].map { |entry| entry[:node] }
    next unless zebra_list?(item_nodes)[:stride] == 1

    puts 'ZEBRA!'
    # Merge nodes (not implemented yet; sketch kept for reference):
    # list[:items].each_with_index do |item, i|
    #   list[:items][i+1][:node] = [
    #     list[:items][i][:node],
    #     list[:items][i+1][:node]
    #   ]
    #   list[:items].delete_at(i)
    # end
  end

  result
end
|
67
|
+
|
68
|
+
# Mines list-like structures from a parsed document.
#
# For every tree level, walks all nodes found at that level and groups runs
# of li/div/tr nodes that share a tag name; runs with two or more items are
# reported. Nodes with any other tag are skipped without ending the run.
#
# @param html [#deepest_level, #xpath] parsed document
# @return [Array<Hash>] one { level:, items: } hash per list, where each item
#   is { titles:, node:, text: }; passed through unzebrify before returning
def lists(html)
  # deepest_level is nil when the document has no element leaves (e.g. empty
  # input); to_i turns that into zero iterations instead of a NoMethodError.
  depth = html.deepest_level.to_i
  result = []

  depth.times do |level|
    xpath = '/*' * (level + 1)
    items = []

    html.xpath(xpath).each do |node|
      next unless %w[li div tr].include?(node.name)

      # A different tag closes the current run; keep it as a collection when
      # 2 or more items were found.
      if items.last && items.last[:node].name != node.name
        result << { level: level, items: items } if items.count > 1
        items = []
      end

      # TODO: a node of the same tag could still belong to one item made of
      # several nodes; outline comparison for that case is not implemented.
      items << {
        titles: Jules::Miners.titles(node),
        node: node,
        text: node.text
      }
    end

    # Flush the run still open at the end of this level.
    result << { level: level, items: items } if items.count > 1
  end

  unzebrify(result)
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Jules
  module Miners
    # Collects the headings found below the given node.
    #
    # Queries the relative XPath ".//hN" for N from 1 to 10 and wraps each hit
    # in a Jules::Abstractions::Title, so results are ordered by heading level
    # first and by query order within a level.
    #
    # @param html [#xpath] node or document to search
    # @return [Array<Jules::Abstractions::Title>]
    def self.titles(html)
      (1..10).each_with_object([]) do |level, found|
        html.xpath(".//h#{level}").each do |heading|
          found << Jules::Abstractions::Title.new(level, heading.text)
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# Add some sugar to Nokogiri
|
2
|
+
# http://stackoverflow.com/questions/7176094/how-do-i-create-an-outline-of-the-html-tag-structure-on-the-page-using-nokogiri
|
3
|
+
# http://stackoverflow.com/questions/5694759/how-do-you-calculate-the-number-of-levels-of-descendants-of-a-nokogiri-node
|
4
|
+
class Nokogiri::XML::Node
  # Outline of this node's subtree: the joined outlines of its element
  # children (non-element children are ignored).
  def to_outline
    element_outlines = children.find_all(&:element?).map(&:to_outline)
    element_outlines.join
  end

  # Number of ancestors of this node.
  def depth
    # Alternative noted by the original author as ~10x slower:
    # xpath('count(ancestor::node())').to_i
    ancestors.size
  end

  # Descendant elements that have no element children, as an Array.
  def leaves
    xpath('.//*[not(*)]').to_a
  end

  # Distance from this node down to its deepest leaf; 0 when there are no leaves.
  def height
    deepest = leaves.map(&:depth).max
    return 0 unless deepest

    deepest - depth
  end

  # The leaves sharing the greatest depth; nil when there are no leaves.
  def deepest_leaves
    grouped = leaves.group_by(&:depth)
    grouped[grouped.keys.max]
  end

  # Greatest depth found among the leaves; nil when there are no leaves.
  def deepest_level
    leaves.map(&:depth).max
  end
end
|
28
|
+
|
29
|
+
class Nokogiri::XML::Element
  # Wraps the outline produced by the superclass implementation in this
  # element's own opening and closing tag.
  def to_outline
    inner_outline = super
    "<#{name}>#{inner_outline}</#{name}>"
  end
end
|
metadata
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jules
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Bart Olsthoorn
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-05-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: whatlanguage
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: damerau-levenshtein
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: High level data mining scraper using patterns, semantics and NLP.
|
70
|
+
email:
|
71
|
+
- bartolsthoorn@gmail.com
|
72
|
+
executables: []
|
73
|
+
extensions: []
|
74
|
+
extra_rdoc_files: []
|
75
|
+
files:
|
76
|
+
- LICENSE
|
77
|
+
- README.md
|
78
|
+
- lib/damerau_levenshtein/sugar.rb
|
79
|
+
- lib/jules.rb
|
80
|
+
- lib/jules/abstractions/list.rb
|
81
|
+
- lib/jules/abstractions/title.rb
|
82
|
+
- lib/jules/document.rb
|
83
|
+
- lib/jules/miners/lists.rb
|
84
|
+
- lib/jules/miners/titles.rb
|
85
|
+
- lib/jules/version.rb
|
86
|
+
- lib/nokogiri/sugar.rb
|
87
|
+
homepage: http://github.com/bartolsthoorn/jules
|
88
|
+
licenses:
|
89
|
+
- MIT
|
90
|
+
metadata: {}
|
91
|
+
post_install_message:
|
92
|
+
rdoc_options: []
|
93
|
+
require_paths:
|
94
|
+
- lib
|
95
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '2.0'
|
100
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - ">="
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0'
|
105
|
+
requirements: []
|
106
|
+
rubyforge_project:
|
107
|
+
rubygems_version: 2.2.2
|
108
|
+
signing_key:
|
109
|
+
specification_version: 4
|
110
|
+
summary: High level data mining scraper using patterns, semantics and NLP.
|
111
|
+
test_files: []
|