jules 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f646200a27845170536bad15df78cd74f9a5fca4
4
+ data.tar.gz: c456c90b2e122bb9598323b751e6066671c21f1a
5
+ SHA512:
6
+ metadata.gz: 5ab95e69f9e86dc1090bc9cf96be3439f00cd6af2a69225c100851ad54a53c830761e7b597832bd771d6b9820a4383145e56360266ac02a2f1518849bf7c314e
7
+ data.tar.gz: 041c7c4b5135becebb8d4e925a0f7035e27bf7ec3b9b806c8c43e3ddaa8aecc33a204841538471bc09bbb07004e1c2c6afdb8fb8f7b536eac4af3edfa476db82
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Bart Olsthoorn, website: bartolsthoorn.nl
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following
10
+ conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,40 @@
1
+ **Hi!** - Don't use this gem _yet_; it is still under heavy development.
2
+
3
+ # Jules
4
+ A data mining scraper with a high level of abstraction. It's capable of finding _lists_, _menus_, _titles_ and _contact data_.
5
+
6
+ Jules uses semantics, patterns and NLP to find data, so you don't have to specify exactly where it is. You'll no longer have to make different scrapers for every new website you want to scrape.
7
+
8
+ ~~~ruby
9
+ gem 'jules'
10
+ ~~~
11
+
12
+ ## Examples
13
+ The following examples show you how to use Jules.
14
+ ### Lists
15
+ ~~~ruby
16
+ html = File.open('web-page.html', 'rb') { |f| f.read }
17
+ j = Jules::HTML(html)
18
+ lists = j.lists
19
+ ~~~
20
+
21
+ The following example gets lists only when they contain certain data types.
22
+ ~~~ruby
23
+ j = Jules::HTML(html)
24
+ lists = j.lists(
25
+ required: [:date, :price],
26
+ optional: [:download_url]
27
+ )
28
+ ~~~
29
+
30
+ ### Jules Abstractions
31
+ - Lists
32
+ - Titles
33
+ - Menus
34
+
35
+ ### Jules Data Types
36
+ - Date *:date*
37
+ - Price *:price*
38
+ - Filesize *:filesize*
39
+ - Download url *:download_url*
40
+ - Telephone number *:telephone_number*
@@ -0,0 +1,9 @@
1
# Convenience addition to the DamerauLevenshtein module. `distance` is not
# defined here; it is expected to come from the damerau-levenshtein gem.
module DamerauLevenshtein
  # Edit distance between two strings, scaled by the length of the longer one.
  #
  # Returns 1.0 for completely different strings and 0.0 for completely
  # identical strings. Two empty strings are identical, so they yield 0.0
  # (dividing 0.0 by a zero length would otherwise produce NaN).
  #
  # @param a [String]
  # @param b [String]
  # @return [Float]
  def self.relative(a, b)
    length = [a.length, b.length].max
    return 0.0 if length.zero?

    DamerauLevenshtein.distance(a, b).to_f / length
  end
end

# Short alias for the module.
DL = DamerauLevenshtein
@@ -0,0 +1,25 @@
1
+ require 'jules/version'
2
+
3
+ require 'nokogiri'
4
+ require 'whatlanguage'
5
+ require 'damerau-levenshtein'
6
+
7
+ require 'nokogiri/sugar'
8
+ require 'damerau_levenshtein/sugar'
9
+
10
+ require 'jules/abstractions/list'
11
+ require 'jules/abstractions/title'
12
+
13
+ require 'jules/miners/titles'
14
+ require 'jules/miners/lists'
15
+
16
+ require 'jules/document'
17
+
18
# Top-level namespace of the Jules gem.
module Jules
  # Frozen so the shared constant strings cannot be mutated in place.
  NAME = 'Jules'.freeze
  LICENSE = 'See LICENSE for licensing details.'.freeze

  # Easter egg: prints a calming message to standard output.
  #
  # @return [nil]
  def self.❨╯°□°❩╯︵┻━┻
    puts 'Calm down, bro'
  end
end
@@ -0,0 +1,10 @@
1
module Jules
  module Abstractions
    # A list abstraction: a title together with the list's content.
    class List
      attr_accessor :title, :content

      # Stores both arguments so the accessors return them; neither value
      # is validated or converted here.
      #
      # @param title the title associated with the list
      # @param content the content of the list
      def initialize(title, content)
        @title = title
        @content = content
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module Jules
  module Abstractions
    # A heading found in a document.
    class Title
      attr_accessor :level, :text, :language

      # @param level [Integer] heading level (H1 means level 1, etc.)
      # @param text [String] the actual title text
      # @raise [ArgumentError] if level is not an Integer or text is not a String
      def initialize(level, text)
        # Fixnum was folded into Integer in Ruby 2.4 and removed in 3.2, so
        # Integer is the check that works on every supported Ruby.
        raise ArgumentError, 'level must be an Integer' unless level.is_a?(Integer)
        raise ArgumentError, 'text must be a String' unless text.is_a?(String)

        @level = level
        @text = text

        # Language detection. String#language is not defined in this file;
        # presumably it is supplied by the whatlanguage gem (confirm).
        @language = text.language
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
module Jules
  class << self
    # Builds a Jules::Document from a raw HTML string.
    #
    # @param html [String] HTML source to parse
    # @param options [Hash] currently unused
    # @return [Jules::Document]
    # @raise [ArgumentError] if html is not a String
    def HTML(html, options = {})
      raise ArgumentError, 'html must be a String' unless html.is_a?(String)

      Jules::Document.new(html)
    end
  end

  # A parsed HTML page that can be mined for titles and lists.
  class Document
    attr_accessor :html

    # @param html [String] HTML source, parsed with Nokogiri
    def initialize(html)
      @html = Nokogiri::HTML::Document.parse(html)
    end

    # @return whatever Jules::Miners.titles finds in the parsed document
    def titles
      Jules::Miners.titles(@html)
    end

    # @return whatever Jules::Miners.lists finds in the parsed document
    def lists
      Jules::Miners.lists(@html)
    end
  end
end
@@ -0,0 +1,114 @@
1
module Jules
  module Miners
    # Tag names that are considered candidate list items.
    LIST_ITEM_TAGS = %w[li div tr].freeze

    class << self
      # Decides whether the items of a list are uniform or alternate.
      #
      # Item outlines are compared to see if the list is structured like
      #   AAAAAA (stride 0)
      #   ABABAB (stride 1)
      # Wider strides (AABBAABB, AAABBBAAABBB) are not detected.
      #
      # Only genuine neighbours are compared: the first item is never
      # compared with the last one, and no item is compared with itself.
      #
      # @param list_items [Array<#to_outline>] the nodes of one list
      # @return [Hash] { stride: 0 or 1, certainty: Float }, where certainty
      #   is 1 minus the average relative distance between compared outlines
      def zebra_list?(list_items)
        outlines = list_items.map(&:to_outline)

        # Stride 0: every item should look like its direct neighbour.
        stride0_certainty = 1 - average_distance(outlines.each_cons(2))

        # A perfect AAAA match needs no further checks, and with fewer than
        # three items there is no pair two positions apart to test for ABAB.
        if stride0_certainty == 1.0 || outlines.size < 3
          return { stride: 0, certainty: stride0_certainty }
        end

        # Stride 1: every item should look like the one two positions later.
        two_apart = outlines.each_cons(3).map { |first, _middle, third| [first, third] }
        stride1_certainty = 1 - average_distance(two_apart)

        if stride1_certainty > stride0_certainty
          { stride: 1, certainty: stride1_certainty }
        else
          { stride: 0, certainty: stride0_certainty }
        end
      end

      # Inspects every list for a zebra (ABAB) item pattern.
      #
      # Merging the alternating nodes of a zebra list is not implemented
      # yet, so the lists are returned unchanged.
      #
      # @param result [Array<Hash>] lists, each with an :items array whose
      #   entries carry a :node
      # @return [Array<Hash>] the same array that was passed in
      def unzebrify(result)
        result.each do |list|
          nodes = list[:items].map { |item| item[:node] }
          next unless zebra_list?(nodes)[:stride] == 1

          # TODO: merge each pair of alternating nodes into one item, e.g.
          #   list[:items][i + 1][:node] = [
          #     list[:items][i][:node],
          #     list[:items][i + 1][:node]
          #   ]
          #   list[:items].delete_at(i)
        end
        result
      end

      # Collects runs of <li>, <div> or <tr> elements per depth level.
      #
      # For every depth level all elements at that depth are scanned in the
      # order xpath returns them. Other tags are skipped without ending a
      # run; a run ends only when the candidate tag name changes. Runs of
      # two or more items are stored as a list.
      #
      # @param html [#deepest_level, #xpath] parsed document
      # @return [Array<Hash>] one { level:, items: } hash per list, where
      #   each item is { titles:, node:, text: }
      def lists(html)
        depth = html.deepest_level
        # Without any leaf element there is no depth to walk.
        return [] unless depth

        result = []
        depth.times do |level|
          items = []
          html.xpath('/*' * (level + 1)).each do |node|
            next unless LIST_ITEM_TAGS.include?(node.name)

            # A different tag name closes the current run.
            if items.last && items.last[:node].name != node.name
              store_list(result, level, items)
              items = []
            end

            # TODO: a node of the same element family may still be one part
            # of an item that spans multiple nodes; outlines are not yet
            # compared here to detect that.
            items << {
              titles: Jules::Miners.titles(node),
              node: node,
              text: node.text
            }
          end
          store_list(result, level, items)
        end
        unzebrify(result)
      end

      private

      # Mean DL.relative distance over pairs of outlines; 0.0 when there
      # are no pairs to compare.
      def average_distance(pairs)
        distances = pairs.map { |left, right| DL.relative(left, right) }
        return 0.0 if distances.empty?

        distances.inject(:+) / distances.size
      end

      # Stores items as a list at the given level when 2 or more were found.
      def store_list(result, level, items)
        result << { level: level, items: items } if items.count > 1
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Jules
  module Miners
    class << self
      # Collects the headings found beneath a node.
      #
      # Descendant elements named h1 through h10 are looked up one level at
      # a time, so the result is ordered by heading level first and by the
      # order xpath returns them second.
      # NOTE(review): HTML only defines h1-h6; levels 7-10 are kept so the
      # result is unchanged, but they look unnecessary - confirm.
      #
      # @param html [#xpath] document or node to search
      # @return [Array<Jules::Abstractions::Title>]
      def titles(html)
        (1..10).flat_map do |level|
          html.xpath(".//h#{level}").map do |heading|
            Jules::Abstractions::Title.new(level, heading.text)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Jules
  # Version of the gem; frozen so the shared string cannot be mutated.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,33 @@
1
# Add some sugar to Nokogiri
# http://stackoverflow.com/questions/7176094/how-do-i-create-an-outline-of-the-html-tag-structure-on-the-page-using-nokogiri
# http://stackoverflow.com/questions/5694759/how-do-you-calculate-the-number-of-levels-of-descendants-of-a-nokogiri-node
class Nokogiri::XML::Node
  # Concatenated outlines of this node's element children. The node's own
  # tag is not included here; Element#to_outline adds it.
  #
  # @return [String]
  def to_outline
    children.find_all(&:element?).map(&:to_outline).join
  end

  # Number of ancestors of this node.
  # Counting via xpath('count(ancestor::node())').to_i is ~10x slower.
  #
  # @return [Integer]
  def depth
    ancestors.size
  end

  # Descendant elements that have no element children of their own.
  #
  # @return [Array]
  def leaves
    xpath('.//*[not(*)]').to_a
  end

  # Distance between this node's depth and that of its deepest leaf;
  # 0 when the node has no leaves.
  #
  # @return [Integer]
  def height
    tallest = leaves.map(&:depth).max
    tallest ? tallest - depth : 0
  end

  # The leaves that sit at the greatest depth.
  #
  # @return [Array, nil] nil when the node has no leaves
  def deepest_leaves
    by_depth = leaves.group_by(&:depth)
    by_depth[by_depth.keys.max]
  end

  # Depth of the deepest leaf.
  #
  # @return [Integer, nil] nil when the node has no leaves
  def deepest_level
    # Same value as the largest group_by key, without building the groups.
    leaves.map(&:depth).max
  end
end

class Nokogiri::XML::Element
  # Outline of this element: its own tag wrapped around the outline of its
  # element children.
  #
  # @return [String]
  def to_outline
    "<#{name}>#{super}</#{name}>"
  end
end
metadata ADDED
@@ -0,0 +1,111 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jules
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Bart Olsthoorn
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: whatlanguage
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: damerau-levenshtein
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: High level data mining scraper using patterns, semantics and NLP.
70
+ email:
71
+ - bartolsthoorn@gmail.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - LICENSE
77
+ - README.md
78
+ - lib/damerau_levenshtein/sugar.rb
79
+ - lib/jules.rb
80
+ - lib/jules/abstractions/list.rb
81
+ - lib/jules/abstractions/title.rb
82
+ - lib/jules/document.rb
83
+ - lib/jules/miners/lists.rb
84
+ - lib/jules/miners/titles.rb
85
+ - lib/jules/version.rb
86
+ - lib/nokogiri/sugar.rb
87
+ homepage: http://github.com/bartolsthoorn/jules
88
+ licenses:
89
+ - MIT
90
+ metadata: {}
91
+ post_install_message:
92
+ rdoc_options: []
93
+ require_paths:
94
+ - lib
95
+ required_ruby_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: '2.0'
100
+ required_rubygems_version: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - ">="
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ requirements: []
106
+ rubyforge_project:
107
+ rubygems_version: 2.2.2
108
+ signing_key:
109
+ specification_version: 4
110
+ summary: High level data mining scraper using patterns, semantics and NLP.
111
+ test_files: []