abrupt 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (84) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.rubocop.yml +16 -0
  4. data/.travis.yml +34 -0
  5. data/Gemfile +4 -0
  6. data/Guardfile +51 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +36 -0
  9. data/Rakefile +7 -0
  10. data/abrupt.gemspec +41 -0
  11. data/assets/rules/datatypes/cax-RequiredFormElement.ttl +34 -0
  12. data/assets/rules/datatypes/cax-readability.ttl +18 -0
  13. data/assets/rules/datatypes/cax-required.ttl +15 -0
  14. data/assets/rules/list/prp-hasState.ttl +10 -0
  15. data/assets/rules/production/non_required_form_element.ttl +24 -0
  16. data/assets/rules/production/state_has_no_html_element.ttl +21 -0
  17. data/assets/schema/schema.json +49 -0
  18. data/assets/schema/v1/complexity.json +142 -0
  19. data/assets/schema/v1/input.json +1136 -0
  20. data/assets/schema/v1/link.json +41 -0
  21. data/assets/schema/v1/picture.json +47 -0
  22. data/assets/schema/v1/readability.json +51 -0
  23. data/assets/schema/v1/subject.json +88 -0
  24. data/assets/voc/tbox.ttl +1632 -0
  25. data/bin/abrupt +63 -0
  26. data/doc/paper/listings/datatype_rule.ttl +0 -0
  27. data/doc/paper/listings/description_logic_infered.ttl +3 -0
  28. data/doc/paper/listings/description_logic_rule.ttl +15 -0
  29. data/doc/paper/listings/inconsistency_rule.ttl +0 -0
  30. data/doc/paper/listings/limitations.ttl +10 -0
  31. data/doc/paper/listings/production_rule.ttl +0 -0
  32. data/doc/paper/listings/propositional_logic_infered.ttl +6 -0
  33. data/doc/paper/listings/propositional_logic_rule.ttl +15 -0
  34. data/doc/paper/listings/unique_nested_uris.ttl +10 -0
  35. data/doc/paper/literature.bib +56 -0
  36. data/doc/paper/main.tex +322 -0
  37. data/doc/poster/Poster.key +0 -0
  38. data/doc/poster/Poster.pdf +0 -0
  39. data/doc/poster/poster.indd +0 -0
  40. data/doc/poster/resources/graph.graffle +0 -0
  41. data/doc/poster/resources/graph.png +0 -0
  42. data/doc/poster/resources/graph_crop.png +0 -0
  43. data/lib/abrupt.rb +90 -0
  44. data/lib/abrupt/converter.rb +130 -0
  45. data/lib/abrupt/crawler.rb +125 -0
  46. data/lib/abrupt/service/absolute_url.rb +32 -0
  47. data/lib/abrupt/service/base.rb +75 -0
  48. data/lib/abrupt/service/complexity.rb +27 -0
  49. data/lib/abrupt/service/input.rb +15 -0
  50. data/lib/abrupt/service/link.rb +15 -0
  51. data/lib/abrupt/service/picture.rb +19 -0
  52. data/lib/abrupt/service/readability.rb +26 -0
  53. data/lib/abrupt/service/subject.rb +19 -0
  54. data/lib/abrupt/transformation/base.rb +145 -0
  55. data/lib/abrupt/transformation/client/base.rb +8 -0
  56. data/lib/abrupt/transformation/client/page_view.rb +27 -0
  57. data/lib/abrupt/transformation/client/visit.rb +56 -0
  58. data/lib/abrupt/transformation/client/visitor.rb +19 -0
  59. data/lib/abrupt/transformation/website/base.rb +8 -0
  60. data/lib/abrupt/transformation/website/complexity.rb +20 -0
  61. data/lib/abrupt/transformation/website/input.rb +42 -0
  62. data/lib/abrupt/transformation/website/link.rb +27 -0
  63. data/lib/abrupt/transformation/website/picture.rb +26 -0
  64. data/lib/abrupt/transformation/website/readability.rb +15 -0
  65. data/lib/abrupt/transformation/website/subject.rb +22 -0
  66. data/lib/abrupt/version.rb +7 -0
  67. data/spec/cassettes/Abrupt_Crawler/outputs_correct_hash.yml +91250 -0
  68. data/spec/converter_spec.rb +34 -0
  69. data/spec/crawler_spec.rb +11 -0
  70. data/spec/factories/crawled_hashes.rb +468 -0
  71. data/spec/fixtures/rikscha-mainz.owl +17456 -0
  72. data/spec/fixtures/rikscha.ohneBilder.2013-04-30_2013-08-17.xml +51759 -0
  73. data/spec/fixtures/rikscha.ohneBilder.2013-04-30_2013-08-17_min.xml +81 -0
  74. data/spec/fixtures/rikscha_Result.xml +11594 -0
  75. data/spec/fixtures/rikscha_Result_min.xml +574 -0
  76. data/spec/spec_helper.rb +26 -0
  77. data/spec/transformation/base_spec.rb +18 -0
  78. data/spec/transformation/website/complexity_spec.rb +188 -0
  79. data/spec/transformation/website/input_spec.rb +181 -0
  80. data/spec/transformation/website/link_spec.rb +13 -0
  81. data/spec/transformation/website/picture_spec.rb +20 -0
  82. data/spec/transformation/website/readability_spec.rb +22 -0
  83. data/spec/transformation/website/subject_spec.rb +40 -0
  84. metadata +424 -0
Binary file
Binary file
Binary file
@@ -0,0 +1,90 @@
1
+ # @author Manuel Dudda
2
+ Dir[File.dirname(__FILE__) + '/abrupt/*.rb'].each do |file|
3
+ require file
4
+ end
5
+ require 'pp'
6
+
7
+ # Extension for String class
8
class String
  # Strips all trailing slashes from the very end of the string.
  # Anchored with \z so that slashes preceding an embedded newline are
  # left alone (the former `$` anchor matched at every line end).
  #
  # @return [String] a new string without trailing '/' characters
  def remove_last_slashes
    sub(%r{/+\z}, '')
  end

  # Appends one '/' when the string is non-empty and does not already end
  # with a slash. An empty string is returned unchanged.
  #
  # @return [String] a new string ending in '/' (or '' for empty input)
  def append_last_slash
    sub(%r{([^/])\z}, '\1/')
  end
end
17
+
18
+ # Extension for all objects
19
class Object
  # Coerces the receiver into a flat array without nil entries:
  # the receiver is wrapped in an array, nested arrays are flattened
  # and nil elements are dropped.
  #
  # @return [Array] e.g. nil => [], 1 => [1], [1, [2, nil]] => [1, 2]
  def ensure_to_a
    wrapped = [self]
    wrapped.flatten!
    wrapped.compact!
    wrapped
  end
end
24
+
25
# Top-level namespace of the gem. Holds the shared vocabulary constants and
# time formats, plus the entry points for crawling a website (+crawl+) and
# for assembling RDF data through the Converter (+convert+, +append+).
module Abrupt
  VOC = RDF::Vocabulary.new('http://wba.cs.hs-rm.de/AbRUPt/')
  # Asset paths are resolved relative to the parent of this file's directory.
  VOC_FILE = File.join File.dirname(__dir__), 'assets', 'voc', 'tbox.ttl'
  RULES_DIR = File.join File.dirname(__dir__), 'assets', 'rules', '*'
  DELIMITER = '/'
  PREFIXES = {
    abrupt: VOC.to_s,
    rdf: RDF.to_s,
    rdfs: RDF::RDFS.to_s,
    xsd: RDF::XSD.to_s,
    owl: RDF::OWL.to_s
  }

  TIME_INPUT_FORMAT = '%d/%b/%Y:%H:%M:%S'
  TIME_OUTPUT_FORMAT = '%Y-%m-%d_%H%M%S'

  # Parses a timestamp written in TIME_INPUT_FORMAT.
  #
  # @param time [String] e.g. '30/Apr/2013:12:00:00'
  # @return [DateTime]
  def self.parse_time(time)
    DateTime.strptime(time, TIME_INPUT_FORMAT)
  end

  # Re-formats a TIME_INPUT_FORMAT timestamp into TIME_OUTPUT_FORMAT.
  #
  # @param time [String] timestamp in TIME_INPUT_FORMAT
  # @return [String] e.g. '2013-04-30_120000'
  def self.format_time(time)
    parse_time(time).strftime(TIME_OUTPUT_FORMAT)
  end

  # @return [String] parent directory of the directory containing this file
  def self.root
    File.dirname __dir__
  end

  # Writes a progress message to stdout without appending a newline.
  #
  # @param msg [String]
  def self.log(msg)
    print msg
  end

  # Crawls +uri+ and prints the converted result to stdout.
  #
  # @param uri [String] start address of the crawl
  # @param args [Array] optional; the first element is the options hash
  #   (its :format entry selects the output, 'xml' or the owl default)
  # @return [nil]
  def self.crawl(uri, *args)
    # Fall back to an empty hash so a call without options does not fail
    # on opts[:format] (or inside the crawler) with a nil receiver.
    opts = args.first || {}
    crawler = Abrupt::Crawler.new uri, opts
    start_time = Time.now
    log "begin: #{start_time}\n"
    result = crawler.crawl
    end_time = Time.now
    log "\nfinished in #{(end_time - start_time).round} sec.\n\n"
    case opts[:format]
    when 'xml'
      puts Converter.xml(result)
    else # owl as default
      # NOTE(review): no Converter.owl is visible in the Converter class;
      # confirm it is defined somewhere, otherwise this branch raises.
      puts Converter.owl(result)
    end
  end

  # Builds a fresh repository from the requested kinds of assertions.
  #
  # @param file [String, Hash] website data (XML file path or crawled hash)
  # @param args [Array] user data file followed by the options hash; the
  #   options' :assertions entry is a comma separated list such as
  #   'tbox,website,user,rules'
  # @return [Object] the converter's result repository
  def self.convert(file, *args)
    converter = Converter.instance
    # Tolerate missing options / a missing :assertions entry instead of
    # failing with NoMethodError on nil.
    options = args.last || {}
    assertions = options[:assertions].to_s.split ','
    converter.init(args[1] || options) # options
    append file, args.first, assertions
    converter.result
  end

  # Appends the selected kinds of data to the converter's repository.
  #
  # @param file [String, Hash] website data
  # @param user_file [String] path to the visitor/user data XML
  # @param assertions [Array<String>] any of 'tbox', 'website', 'user', 'rules'
  def self.append(file, user_file, assertions)
    converter = Converter.instance
    converter.append_tbox if assertions.include?('tbox')
    converter.append_website_data(file) if assertions.include?('website')
    converter.append_user_data(user_file) if assertions.include?('user')
    converter.append_rules if assertions.include?('rules')
  end
end
@@ -0,0 +1,130 @@
1
+ # @author Manuel Dudda
2
+ require 'singleton'
3
+ require 'rest_client'
4
+ require 'gyoku'
5
+ require 'rdf'
6
+ require 'linkeddata'
7
+ require 'active_support'
8
+ require 'active_support/core_ext'
9
+ Dir[File.dirname(__FILE__) + '/transformation/*.rb',
10
+ File.dirname(__FILE__) + '/transformation/website/*.rb',
11
+ File.dirname(__FILE__) + '/transformation/client/*.rb'].each do |file|
12
+ require file
13
+ end
14
+
15
+ # Abrupt Converter
16
module Abrupt
  # Singleton that gathers RDF statements from the vocabulary file, crawled
  # website data, visitor data and rule files into one repository (+result+).
  # Call +init+ before any of the +append_*+ methods.
  class Converter
    include Singleton
    include RDF
    attr_accessor :hsh, :values, :result, :format, :uri

    # Resets the converter with an empty repository.
    #
    # @param options [Hash] :format selects the output format (default :turtle)
    def init(options = {})
      @format = options[:format].try(:to_sym) || :turtle
      @result = Repository.new
    end

    # Loads the vocabulary file (VOC_FILE) into the repository.
    def append_tbox
      @result << Repository.load(VOC_FILE)
    end

    # Adds the website, its pages and their states to the repository.
    #
    # @param hsh [Hash, String] crawled hash or path to an XML file
    def append_website_data(hsh)
      init_hsh(hsh)
      # init_hsh accepts input without a :website entry; there is nothing
      # to append in that case, so stop before dereferencing it.
      return unless @hsh[:website]
      @uri = URI(@hsh[:website][:domain])
      init_website
      perform
    end

    # Adds the Website individual and its host name.
    def init_website
      domain = RDF::URI("#{VOC}Website/#{@uri}")
      @result << Statement.new(domain, RDF.type, VOC.Website)
      @result << Statement.new(domain, VOC.hostName, @uri.host)
    end

    # Stores the input as a symbolized hash in @hsh and passes every url
    # entry through the website transformations' customize_to_schema.
    #
    # @param hsh [Hash, String] crawled hash or path to an XML file
    def init_hsh(hsh)
      hsh = Hash.from_xml(File.read(hsh)) unless hsh.is_a?(Hash)
      @hsh = hsh.deep_symbolize_keys
      return unless @hsh[:website]
      # A single <url> element is parsed into a Hash instead of an Array;
      # normalise so that the iteration here and in #perform sees pages.
      urls = @hsh[:website][:url].ensure_to_a
      urls.each_with_index do |value, i|
        # NOTE(review): each class receives the original value, so only the
        # last return value is kept unless customize_to_schema mutates its
        # argument in place - confirm that this is intended.
        Transformation::Website::Base.subclasses.each do |transformation_class|
          urls[i] = transformation_class.customize_to_schema(value)
        end
      end
      @hsh[:website][:url] = urls
    end

    # @param hsh [Hash]
    # @return [String] XML rendering of the hash (via Gyoku)
    def self.xml(hsh)
      Gyoku.xml hsh
    end

    # @param hsh [Hash]
    # @return [String] JSON rendering of the hash
    def self.json(hsh)
      hsh.to_json
    end

    # Inserts every statement of the given collection into the repository.
    def add_to_result(statements)
      statements.each { |stmt| @result << stmt }
    end

    # Adds one Page individual per url entry and descends into its states.
    def perform
      website = ['Website', @uri.to_s]
      @hsh[:website][:url].each do |url|
        page = ['Page', url[:name].append_last_slash]
        page_transformator = Transformation::Base.new(website, page)
        add_to_result page_transformator.add_individuals # add Page
        next unless url[:state]
        perform_states url[:state], website + page
      end
    end

    # Adds the State individuals of a page and runs every website
    # transformation on each state.
    #
    # @param states [Hash, Array<Hash>] one state or a list of states
    # @param parent_uri [Array<String>] uri parts of the owning page
    def perform_states(states, parent_uri)
      states = states.is_a?(Array) ? states : [states]
      states.each do |value|
        state = ['State', value[:name]]
        # MAYBE empty?
        add_to_result Transformation::Base.new(parent_uri, state).result
        Transformation::Website::Base.subclasses.each do |transformation_class|
          t = transformation_class.new(parent_uri + state, nil, value)
          add_to_result t.add_individuals
        end
      end
    end

    # Adds visitors (and their page visits) read from an XML file.
    # Does nothing when +file+ is not the path of an existing file.
    #
    # @param file [String] path to the visitor XML
    # @return [Object, nil] the repository, or nil when the file is unusable
    def append_user_data(file)
      return unless file.is_a?(String) && File.exist?(file)
      xml = Hash.from_xml(File.read(file)).deep_symbolize_keys
      # Missing <database> is treated like an empty visitor list.
      visitors = (xml[:database] || {})[:visitor].ensure_to_a
      visitors.each do |values|
        ip = values[:ip]
        next unless ip
        visitor = Transformation::Client::Visitor.new(
          ['Website', @uri.to_s], ['Visitor', ip], values)
        add_to_result visitor.add_individuals
        append_pages_for_visitor(visitor)
      end
      @result
    end

    # Runs every client transformation for each page a visitor entered.
    #
    # @param visitor [Transformation::Client::Visitor]
    def append_pages_for_visitor(visitor)
      # A visitor without <pages> simply contributes no visits.
      pages = (visitor.values[:pages] || {})[:page].ensure_to_a
      pages.each do |page|
        time = ::Abrupt.format_time(page[:entertime])
        Transformation::Client::Base.subclasses.each do |transformation_class|
          transformator = transformation_class.new(
            visitor.parent_uri + visitor.uri,
            ['Visit', time], page
          )
          add_to_result transformator.add_individuals
        end
      end
    end

    # Loads every file found one level below each RULES_DIR directory and
    # adds its statements to the repository.
    def append_rules
      Dir.glob(RULES_DIR).each do |rule_directory|
        Dir.glob(File.join(rule_directory, '*')).each do |rule_file|
          rule = Repository.load(rule_file)
          add_to_result(rule.statements)
        end
      end
    end
  end
end
@@ -0,0 +1,125 @@
1
+ # @author Manuel Dudda
2
+ require 'rest_client'
3
+ require 'addressable/uri'
4
+ %w( base
5
+ readability
6
+ subject
7
+ input
8
+ complexity
9
+ picture
10
+ link
11
+ absolute_url).each do |f|
12
+ require_relative "service/#{f}"
13
+ end
14
module Abrupt
  # Crawler for a website including all followed urls
  # with performing abrupt services
  # BETA!!!
  class Crawler
    # Maps the one-letter service keys to their service classes.
    SERVICE_MAPPING = {
      r: Service::Readability,
      i: Service::Input,
      s: Service::Subject,
      c: Service::Complexity,
      l: Service::Link,
      p: Service::Picture
    }.freeze

    # @param uri [String] start address of the crawl
    # @param args [Array] optional; the first element is an options hash
    #   with :services (list of service keys), :lang and :nofollow
    def initialize(uri, *args)
      @uri = Addressable::URI.parse(uri).normalize
      # Allow construction without an options hash.
      opts = args.first || {}
      @options = {
        lang: 'en',
        services: %w(r i s c l p),
        depth: '3',
        word_limit: 20
      }
      @options[:services] = opts[:services] if opts[:services]
      @options[:lang] = opts[:lang] if opts[:lang]
      @follow_links = !opts[:nofollow]
      @result = {}
    end

    # Crawls a page (and, unless disabled, every same-host page linked from
    # it), storing the service results per uri.
    #
    # @param uri [String] the uri to crawl, defaults to the start address
    # @return [Hash] all results collected so far, shaped by
    #   Service::Base.transform_hash
    def crawl(uri = nil)
      visit(uri || @uri.to_str.append_last_slash)
      # Transform once per public call instead of once per visited page.
      Service::Base.transform_hash(@result)
    end

    # TODO: maybe as class method
    # Collects the hrefs of a crawled page that point to the start host.
    #
    # @param uri [String] an already crawled uri
    # @return [Array<String>]
    def uris_with_same_host(uri)
      links = @result[uri][:link] && @result[uri][:link]['a']
      return [] unless links
      links.to_a.map { |link| link['href'] if same_host?(link['href']) }.compact
    end

    # Downloads a page.
    #
    # @param uri [String]
    # @return [String, false, nil] the body for an html response with a
    #   status in 200...400; nil for non-html content or a failed request;
    #   false for any other status
    def fetch_html(uri)
      uri = Addressable::URI.parse(uri.strip).normalize.to_str
      begin
        response = ::RestClient.get uri, accept: :html
        content_type = response.headers[:content_type].to_s
        case response.code
        when 200...400
          response.to_str if html?(content_type)
        else
          false
        end
      rescue => e
        puts "error fetching html on #{uri}"
        puts e
        nil
      end
    end

    # @param content_type [String] value of a Content-Type header
    # @return [Boolean] true when it starts with 'text/html'
    def html?(content_type)
      content_type.start_with?('text/html')
    end

    # @param uri [String, nil] a link target
    # @return [Boolean] true when the uri has the same host as the start uri;
    #   false for blank or unparsable uris
    def same_host?(uri)
      return false if uri.to_s.empty?
      Addressable::URI.parse(uri).host.eql?(@uri.host)
    rescue Addressable::URI::InvalidURIError
      # A malformed href on a page must not abort the whole crawl.
      false
    end

    # Instantiates every configured service for the given html.
    #
    # @param html [String]
    # @return [Hash{String => Service::Base}] services by their keyname
    # @raise [ArgumentError] for a service key missing in SERVICE_MAPPING
    def init_services_hash(html)
      @options[:services].map do |s|
        s = s.to_sym
        service_class = SERVICE_MAPPING.fetch(s) do
          raise ArgumentError, "unknown service '#{s}'"
        end
        available_options = service_class.available_options
        opts = available_options.map { |o| [o, @options[o.to_sym]] }.to_h
        service = service_class.new(html, opts)
        [service_class.keyname, service]
      end.to_h
    end

    # Sends the html through the AbsoluteUrl service, using the start uri's
    # scheme and host as base url.
    #
    # @param html [String]
    # @return [String, nil] the service response, nil when the request failed
    def canonize_html(html)
      baseurl = "#{@uri.scheme}://#{@uri.host}"
      converter = Service::AbsoluteUrl.new(html, baseurl: baseurl)
      converter.execute
    end

    # Runs all configured services on one page.
    #
    # @param html [String]
    # @return [Hash{Symbol => Object}] service results by keyname
    def perform_services(html)
      result = {}
      # Keep the fetched html when the canonicalisation request failed,
      # rather than handing nil to every service.
      html = canonize_html(html) || html
      services_hash = init_services_hash(html)
      services_hash.each do |json_field, service_class|
        result[json_field.to_sym] = service_class.execute
      end
      result
    end

    private

    # Recursive worker behind #crawl: processes one uri unless it is already
    # present in the result hash, then follows its same-host links.
    def visit(uri)
      Abrupt.log '.'
      return if @result[uri]
      html = fetch_html(uri)
      @result[uri] = html ? perform_services(html) : {}
      # new_uris.select! { |url| same_host?(url) } # filter
      return unless @follow_links
      uris_with_same_host(uri).uniq.each { |url| visit(url) }
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # @author Manuel Dudda
2
module Abrupt
  module Service
    # Absolute url service: posts the html held by the instance to the
    # absoluteurl endpoint and hands back the response body.
    # (presumably the endpoint rewrites relative urls against the given
    # baseurl - the remote behaviour is not visible here)
    # documentation see 'http://wba.cs.hs-rm.de/AbRUPt/service/absoluteurl/'
    class AbsoluteUrl < Base
      # TODO: outsource service uri to module Service
      SERVICE_URI = 'http://wba.cs.hs-rm.de/AbRUPt/service/absoluteurl/'

      # @return [String] endpoint of this service
      def service_uri
        SERVICE_URI
      end

      # Performs the POST request.
      #
      # @return [String, nil] response body, or nil when the request raised
      def execute
        # NOTE(review): rest-client reads request headers from :headers;
        # a top-level :accept may have no effect - confirm.
        request = {
          method: :post,
          timeout: 6000,
          open_timeout: 6000,
          accept: :html,
          url: @url,
          payload: @html
        }
        RestClient::Request.execute(request).to_str
      rescue => e
        puts "some problems with #{@url}"
        puts e
        nil
      end
    end
  end
end
@@ -0,0 +1,75 @@
1
+ # @author Manuel Dudda
2
require 'rest_client'
require 'uri'
3
module Abrupt
  module Service
    # Base class of all services: builds the request url from the service
    # uri plus options and posts an html document to it.
    class Base
      attr_accessor :url, :abbr, :options, :response
      # TODO: outsource service uri to module Service
      SERVICE_URI = 'http://wba.cs.hs-rm.de/AbRUPt/service/complexity/public/index.php/api/v1/complexity'

      # @return [String] endpoint of the service; subclasses override this
      def service_uri
        SERVICE_URI
      end

      # @param html [String] document that is sent as request payload
      # @param options [Hash] sent as query parameters; nil values are
      #   transmitted as empty values
      def initialize(html, options = {})
        @html = html
        # Kept as given (it used to be overwritten with [] at the end).
        @options = options
        # URL-encode keys and values so special characters survive.
        query = URI.encode_www_form(@options.map { |key, value| [key, value.to_s] })
        @url = query.empty? ? service_uri : "#{service_uri}?#{query}"
        # First letter of the un-namespaced class name, e.g. 'r' for a class
        # named Readability (the full name always starts with the namespace).
        @abbr = self.class.keyname[0]
      end

      # @return [Array<String>] option names the service understands;
      #   none for the base class
      def self.available_options
        []
      end

      # @return [String] lower-cased class name without namespace
      def self.keyname
        name.split('::').last.downcase
      end

      # TODO: naming of interface execute
      # Posts the html to the service and parses the JSON response.
      #
      # @return [Object, nil] parsed JSON (also stored in +response+), or nil
      #   when the request or the parsing raised
      def execute
        # NOTE(review): rest-client reads request headers from :headers;
        # a top-level :accept may have no effect - confirm.
        # NOTE(review): 6000 looks like milliseconds, rest-client timeouts
        # are given in seconds - confirm.
        request = {
          method: :post,
          timeout: 6000,
          open_timeout: 6000,
          accept: :schema,
          url: @url,
          payload: @html
        }
        @response = JSON.parse(RestClient::Request.execute(request).to_str)
      rescue => e
        puts "some problems with #{@url}"
        puts e
        nil
      end

      # Converts the crawler's { uri => service results } hash into the
      # website structure { website: { domain:, url: [{ name:, state: }] } }.
      # The domain is taken from the first key.
      #
      # @param hsh [Hash{String => Hash}] must contain at least one entry
      # @return [Hash] structure with deeply symbolized keys
      def self.transform_hash(hsh)
        uri = Addressable::URI.parse(hsh.keys.first).normalize
        pages = hsh.map { |key, value| { name: key, state: value } }
        result = {
          website: {
            domain: "#{uri.scheme}://#{uri.host}",
            url: pages
          }
        }
        result.deep_symbolize_keys
      end
    end
  end
end