abrupt 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.rubocop.yml +16 -0
  4. data/.travis.yml +34 -0
  5. data/Gemfile +4 -0
  6. data/Guardfile +51 -0
  7. data/LICENSE.txt +22 -0
  8. data/README.md +36 -0
  9. data/Rakefile +7 -0
  10. data/abrupt.gemspec +41 -0
  11. data/assets/rules/datatypes/cax-RequiredFormElement.ttl +34 -0
  12. data/assets/rules/datatypes/cax-readability.ttl +18 -0
  13. data/assets/rules/datatypes/cax-required.ttl +15 -0
  14. data/assets/rules/list/prp-hasState.ttl +10 -0
  15. data/assets/rules/production/non_required_form_element.ttl +24 -0
  16. data/assets/rules/production/state_has_no_html_element.ttl +21 -0
  17. data/assets/schema/schema.json +49 -0
  18. data/assets/schema/v1/complexity.json +142 -0
  19. data/assets/schema/v1/input.json +1136 -0
  20. data/assets/schema/v1/link.json +41 -0
  21. data/assets/schema/v1/picture.json +47 -0
  22. data/assets/schema/v1/readability.json +51 -0
  23. data/assets/schema/v1/subject.json +88 -0
  24. data/assets/voc/tbox.ttl +1632 -0
  25. data/bin/abrupt +63 -0
  26. data/doc/paper/listings/datatype_rule.ttl +0 -0
  27. data/doc/paper/listings/description_logic_infered.ttl +3 -0
  28. data/doc/paper/listings/description_logic_rule.ttl +15 -0
  29. data/doc/paper/listings/inconsistency_rule.ttl +0 -0
  30. data/doc/paper/listings/limitations.ttl +10 -0
  31. data/doc/paper/listings/production_rule.ttl +0 -0
  32. data/doc/paper/listings/propositional_logic_infered.ttl +6 -0
  33. data/doc/paper/listings/propositional_logic_rule.ttl +15 -0
  34. data/doc/paper/listings/unique_nested_uris.ttl +10 -0
  35. data/doc/paper/literature.bib +56 -0
  36. data/doc/paper/main.tex +322 -0
  37. data/doc/poster/Poster.key +0 -0
  38. data/doc/poster/Poster.pdf +0 -0
  39. data/doc/poster/poster.indd +0 -0
  40. data/doc/poster/resources/graph.graffle +0 -0
  41. data/doc/poster/resources/graph.png +0 -0
  42. data/doc/poster/resources/graph_crop.png +0 -0
  43. data/lib/abrupt.rb +90 -0
  44. data/lib/abrupt/converter.rb +130 -0
  45. data/lib/abrupt/crawler.rb +125 -0
  46. data/lib/abrupt/service/absolute_url.rb +32 -0
  47. data/lib/abrupt/service/base.rb +75 -0
  48. data/lib/abrupt/service/complexity.rb +27 -0
  49. data/lib/abrupt/service/input.rb +15 -0
  50. data/lib/abrupt/service/link.rb +15 -0
  51. data/lib/abrupt/service/picture.rb +19 -0
  52. data/lib/abrupt/service/readability.rb +26 -0
  53. data/lib/abrupt/service/subject.rb +19 -0
  54. data/lib/abrupt/transformation/base.rb +145 -0
  55. data/lib/abrupt/transformation/client/base.rb +8 -0
  56. data/lib/abrupt/transformation/client/page_view.rb +27 -0
  57. data/lib/abrupt/transformation/client/visit.rb +56 -0
  58. data/lib/abrupt/transformation/client/visitor.rb +19 -0
  59. data/lib/abrupt/transformation/website/base.rb +8 -0
  60. data/lib/abrupt/transformation/website/complexity.rb +20 -0
  61. data/lib/abrupt/transformation/website/input.rb +42 -0
  62. data/lib/abrupt/transformation/website/link.rb +27 -0
  63. data/lib/abrupt/transformation/website/picture.rb +26 -0
  64. data/lib/abrupt/transformation/website/readability.rb +15 -0
  65. data/lib/abrupt/transformation/website/subject.rb +22 -0
  66. data/lib/abrupt/version.rb +7 -0
  67. data/spec/cassettes/Abrupt_Crawler/outputs_correct_hash.yml +91250 -0
  68. data/spec/converter_spec.rb +34 -0
  69. data/spec/crawler_spec.rb +11 -0
  70. data/spec/factories/crawled_hashes.rb +468 -0
  71. data/spec/fixtures/rikscha-mainz.owl +17456 -0
  72. data/spec/fixtures/rikscha.ohneBilder.2013-04-30_2013-08-17.xml +51759 -0
  73. data/spec/fixtures/rikscha.ohneBilder.2013-04-30_2013-08-17_min.xml +81 -0
  74. data/spec/fixtures/rikscha_Result.xml +11594 -0
  75. data/spec/fixtures/rikscha_Result_min.xml +574 -0
  76. data/spec/spec_helper.rb +26 -0
  77. data/spec/transformation/base_spec.rb +18 -0
  78. data/spec/transformation/website/complexity_spec.rb +188 -0
  79. data/spec/transformation/website/input_spec.rb +181 -0
  80. data/spec/transformation/website/link_spec.rb +13 -0
  81. data/spec/transformation/website/picture_spec.rb +20 -0
  82. data/spec/transformation/website/readability_spec.rb +22 -0
  83. data/spec/transformation/website/subject_spec.rb +40 -0
  84. metadata +424 -0
Binary file
Binary file
Binary file
@@ -0,0 +1,90 @@
1
+ # @author Manuel Dudda
2
+ Dir[File.dirname(__FILE__) + '/abrupt/*.rb'].each do |file|
3
+ require file
4
+ end
5
+ require 'pp'
6
+
7
+ # Extension for String class
8
# Extension for String class
class String
  # Returns a copy of the string with every trailing slash removed.
  #
  # Anchored with \z (end of string) rather than $ (end of line), so only the
  # very end of the string is touched even when it contains newlines.
  #
  # @return [String] new string without trailing '/' characters
  def remove_last_slashes
    sub(%r{/+\z}, '')
  end

  # Returns a copy of the string that ends with a slash.
  #
  # A string that already ends with '/' is returned unchanged (as a copy), and
  # so is the empty string, which has no last character to append to.
  #
  # @return [String] new string terminated by '/'
  def append_last_slash
    sub(%r{([^/])\z}, '\1/')
  end
end
17
+
18
+ # Extension for all objects
19
# Extension for all objects
class Object
  # Coerces the receiver into a flat array without nil entries.
  #
  # The receiver is wrapped in an array, nested arrays are flattened, and nil
  # elements are dropped, so nil becomes [] and a single value becomes
  # a one-element array.
  #
  # @return [Array] flat, nil-free array
  def ensure_to_a
    wrapped = [self].flatten
    wrapped.compact!
    wrapped
  end
end
24
+
25
+ # This module is cool
26
+ # @abstract
27
# Root namespace of the AbRUPt gem. Holds the shared constants (RDF vocabulary,
# asset locations, time formats) and the entry points for crawling a website
# and for converting collected data into an RDF repository.
module Abrupt
  # RDF vocabulary under which the AbRUPt terms are addressed.
  VOC = RDF::Vocabulary.new('http://wba.cs.hs-rm.de/AbRUPt/')
  # Vocabulary file loaded by Converter#append_tbox.
  VOC_FILE = File.join File.dirname(__dir__), 'assets', 'voc', 'tbox.ttl'
  # Glob pattern matching the rule directories read by Converter#append_rules.
  RULES_DIR = File.join File.dirname(__dir__), 'assets', 'rules', '*'
  DELIMITER = '/'
  PREFIXES = {
    abrupt: VOC.to_s,
    rdf: RDF.to_s,
    rdfs: RDF::RDFS.to_s,
    xsd: RDF::XSD.to_s,
    owl: RDF::OWL.to_s
  }

  # strptime pattern of incoming timestamps, e.g. "30/Apr/2013:12:00:00".
  TIME_INPUT_FORMAT = '%d/%b/%Y:%H:%M:%S'
  # strftime pattern used when a timestamp becomes part of a name.
  TIME_OUTPUT_FORMAT = '%Y-%m-%d_%H%M%S'

  # Parses a timestamp written in TIME_INPUT_FORMAT.
  #
  # @param time [String] timestamp to parse
  # @return [DateTime] parsed value
  def self.parse_time(time)
    DateTime.strptime(time, TIME_INPUT_FORMAT)
  end

  # Re-formats a timestamp from TIME_INPUT_FORMAT to TIME_OUTPUT_FORMAT.
  #
  # @param time [String] timestamp in TIME_INPUT_FORMAT
  # @return [String] timestamp in TIME_OUTPUT_FORMAT
  def self.format_time(time)
    parse_time(time).strftime(TIME_OUTPUT_FORMAT)
  end

  # @return [String] parent directory of the directory containing this file
  def self.root
    File.dirname __dir__
  end

  # Writes a progress message to standard output without a trailing newline.
  #
  # @param msg [String] text to print
  def self.log(msg)
    print msg
  end

  # Crawls a website and prints the converted result to standard output.
  #
  # @param uri [String] start address of the crawl
  # @param args [Array] optional; the first element is an options Hash that is
  #   forwarded to Crawler.new and whose :format entry ('xml', anything else
  #   selects owl) picks the output format
  # @return [nil] the return value of puts
  def self.crawl(uri, *args)
    # Options are optional; without this default a bare `crawl(uri)` call
    # failed with NoMethodError on nil.
    opts = args.first || {}
    crawler = Abrupt::Crawler.new uri, opts
    start_time = Time.now
    log "begin: #{start_time}\n"
    result = crawler.crawl
    end_time = Time.now
    log "\nfinished in #{(end_time - start_time).round} sec.\n\n"
    case opts[:format]
    when 'xml'
      puts Converter.xml(result)
    else # owl as default
      # NOTE(review): no `owl` class method is visible in converter.rb (only
      # `xml` and `json`) - confirm where Converter.owl is defined.
      puts Converter.owl(result)
    end
  end

  # Builds an RDF repository from crawl data and, optionally, user data.
  #
  # @param file [String, Hash] website data handed to
  #   Converter#append_website_data
  # @param args [Array] the first element is used as the user data file, the
  #   second is passed to Converter#init as options, and the last must respond
  #   to [:assertions] with a comma separated String made of 'tbox',
  #   'website', 'user' and/or 'rules'
  # @return [Object] the converter's result repository
  def self.convert(file, *args)
    converter = Converter.instance
    assertions = args.last[:assertions].split ','
    converter.init(args[1]) # options
    append file, args.first, assertions
    converter.result
  end

  # Appends the requested kinds of statements to the converter's result.
  #
  # @param file [String, Hash] website data (used for 'website')
  # @param user_file [String] user data file (used for 'user')
  # @param assertions [Array<String>] which parts to append
  def self.append(file, user_file, assertions)
    converter = Converter.instance
    converter.append_tbox if assertions.include?('tbox')
    converter.append_website_data(file) if assertions.include?('website')
    converter.append_user_data(user_file) if assertions.include?('user')
    converter.append_rules if assertions.include?('rules')
  end
end
@@ -0,0 +1,130 @@
1
+ # @author Manuel Dudda
2
+ require 'singleton'
3
+ require 'rest_client'
4
+ require 'gyoku'
5
+ require 'rdf'
6
+ require 'linkeddata'
7
+ require 'active_support'
8
+ require 'active_support/core_ext'
9
+ Dir[File.dirname(__FILE__) + '/transformation/*.rb',
10
+ File.dirname(__FILE__) + '/transformation/website/*.rb',
11
+ File.dirname(__FILE__) + '/transformation/client/*.rb'].each do |file|
12
+ require file
13
+ end
14
+
15
+ # Abrupt Converter
16
module Abrupt
  # Singleton that collects RDF statements for a website (pages, states and
  # the per-state service results), for visitor data and for the bundled
  # rules into one repository (#result).
  class Converter
    include Singleton
    include RDF
    attr_accessor :hsh, :values, :result, :format, :uri

    # Resets the converter: picks the output format and starts with an empty
    # repository.
    #
    # @param options [Hash, nil] :format selects the format (default :turtle);
    #   nil is accepted because Abrupt.convert forwards a possibly missing
    #   argument
    def init(options = {})
      options ||= {}
      @format = options[:format].try(:to_sym) || :turtle
      @result = Repository.new
    end

    # Loads the vocabulary file (VOC_FILE) into the result.
    def append_tbox
      @result << Repository.load(VOC_FILE)
    end

    # Adds the website individual and all of its pages and states.
    #
    # @param hsh [Hash, String] crawl result, or path to an XML file with the
    #   same structure
    def append_website_data(hsh)
      init_hsh(hsh)
      @uri = URI(@hsh[:website][:domain])
      init_website
      perform
    end

    # Adds the statements describing the website itself (type and host name).
    def init_website
      domain = RDF::URI("#{VOC}Website/#{@uri}")
      @result << Statement.new(domain, RDF.type, VOC.Website)
      @result << Statement.new(domain, VOC.hostName, @uri.host)
    end

    # Stores the input with symbolized keys in @hsh and lets every website
    # transformation class adapt each url entry.
    #
    # @param hsh [Hash, String] hash, or path of an XML file to read
    def init_hsh(hsh)
      hsh = Hash.from_xml(File.read(hsh)) unless hsh.is_a?(Hash)
      @hsh = hsh.deep_symbolize_keys
      return unless @hsh[:website]
      # Hash.from_xml yields a single Hash (not an Array) when the document
      # contains exactly one <url>; normalise so that the loop below and
      # #perform always see an Array, as #perform_states does for states.
      urls = @hsh[:website][:url]
      urls = [urls].compact unless urls.is_a?(Array)
      @hsh[:website][:url] = urls
      urls.each_with_index do |value, i|
        Transformation::Website::Base.subclasses.each do |transformation_class|
          # NOTE(review): every class receives the original `value`; this only
          # accumulates if customize_to_schema modifies it in place - confirm.
          urls[i] = transformation_class.customize_to_schema(value)
        end
      end
    end

    # @param hsh [Hash] data to serialize
    # @return [String] XML produced by Gyoku
    def self.xml(hsh)
      Gyoku.xml hsh
    end

    # @param hsh [Hash] data to serialize
    # @return [String] JSON document
    def self.json(hsh)
      hsh.to_json
    end

    # Pushes each statement into the result repository.
    #
    # @param statements [Enumerable] statements to add
    def add_to_result(statements)
      statements.each { |stmt| @result << stmt }
    end

    # Adds one Page individual per url entry and descends into its states.
    def perform
      website = ['Website', @uri.to_s]
      @hsh[:website][:url].each do |url|
        page = ['Page', url[:name].append_last_slash]
        page_transformator = Transformation::Base.new(website, page)
        add_to_result page_transformator.add_individuals # add Page
        next unless url[:state]
        perform_states url[:state], website + page
      end
    end

    # Adds the State individuals of a page and runs every website
    # transformation on each state.
    #
    # @param states [Hash, Array<Hash>] one state or a list of states
    # @param parent_uri [Array<String>] path parts of the owning page
    def perform_states(states, parent_uri)
      states = states.is_a?(Array) ? states : [states]
      states.each do |value|
        state = ['State', value[:name]]
        # MAYBE empty?
        add_to_result Transformation::Base.new(parent_uri, state).result
        Transformation::Website::Base.subclasses.each do |transformation_class|
          t = transformation_class.new(parent_uri + state, nil, value)
          add_to_result t.add_individuals
        end
      end
    end

    # Adds visitors (and their page visits) read from an XML file. Entries
    # without an :ip value are skipped.
    #
    # @param file [String] path to the XML file; anything that is not an
    #   existing file path is ignored
    # @return [Object, nil] the result repository, or nil when file is ignored
    def append_user_data(file)
      return unless file.is_a?(String) && File.exist?(file)
      xml = Hash.from_xml(File.read(file)).deep_symbolize_keys
      xml[:database][:visitor].ensure_to_a.each do |values|
        ip = values[:ip]
        next unless ip
        visitor = Transformation::Client::Visitor.new(
          ['Website', @uri.to_s], ['Visitor', ip], values)
        add_to_result visitor.add_individuals
        append_pages_for_visitor(visitor)
      end
      @result
    end

    # Runs every client transformation for each page entry of a visitor.
    #
    # @param visitor [Transformation::Client::Visitor] visitor whose pages are
    #   appended
    def append_pages_for_visitor(visitor)
      # A visitor without a :pages entry simply contributes no visits.
      pages = (visitor.values[:pages] || {})[:page].ensure_to_a
      pages.each do |page|
        time = ::Abrupt.format_time(page[:entertime])
        Transformation::Client::Base.subclasses.each do |transformation_class|
          transformator = transformation_class.new(
            visitor.parent_uri + visitor.uri,
            ['Visit', time], page
          )
          add_to_result transformator.add_individuals
        end
      end
    end

    # Loads every file of every rule directory (RULES_DIR) into the result.
    def append_rules
      Dir.glob(RULES_DIR).each do |rule_directory|
        Dir.glob(File.join(rule_directory, '*')).each do |rule_file|
          rule = Repository.load(rule_file)
          add_to_result(rule.statements)
        end
      end
    end
  end
end
@@ -0,0 +1,125 @@
1
+ # @author Manuel Dudda
2
+ require 'rest_client'
3
+ require 'addressable/uri'
4
+ %w( base
5
+ readability
6
+ subject
7
+ input
8
+ complexity
9
+ picture
10
+ link
11
+ absolute_url).each do |f|
12
+ require_relative "service/#{f}"
13
+ end
14
module Abrupt
  # Crawler for a website including all followed urls
  # with performing abrupt services
  # BETA!!!
  class Crawler
    # One-letter service abbreviation => service class.
    SERVICE_MAPPING = {
      r: Service::Readability,
      i: Service::Input,
      s: Service::Subject,
      c: Service::Complexity,
      l: Service::Link,
      p: Service::Picture
    }

    # @param uri [String] start address of the crawl
    # @param args [Array] optional; the first element is an options Hash with
    #   :services (list of SERVICE_MAPPING keys), :lang and :nofollow
    def initialize(uri, *args)
      @uri = Addressable::URI.parse(uri).normalize
      # Options are optional; a missing hash used to raise NoMethodError.
      opts = args.first || {}
      # NOTE(review): :depth is stored but not read anywhere in this class.
      @options = {
        lang: 'en',
        services: %w(r i s c l p),
        depth: '3',
        word_limit: 20
      }
      @options[:services] = opts[:services] if opts[:services]
      @options[:lang] = opts[:lang] if opts[:lang]
      @follow_links = !opts[:nofollow]
      @result = {}
    end

    # Crawls a page (and, unless :nofollow was given, every same-host page
    # linked from it), saving the service results in the result hash.
    #
    # @param uri [String] the uri to crawl; defaults to the start address
    # @return [Hash] everything crawled so far, as built by
    #   Service::Base.transform_hash
    def crawl(uri = nil)
      crawl_page(uri || @uri.to_str.append_last_slash)
      # Transform once, after the recursion, instead of once per visited page.
      Service::Base.transform_hash(@result)
    end

    # TODO: maybe as class method
    # Collects the hrefs stored under the :link result of a crawled page that
    # point to the host being crawled.
    #
    # @param uri [String] key of an already crawled page
    # @return [Array<String>] same-host link targets (may contain duplicates)
    def uris_with_same_host(uri)
      links = @result[uri][:link]
      anchors = links && links['a']
      return [] unless anchors
      anchors.to_a.map do |link|
        link['href'] if same_host?(link['href'])
      end.compact
    end

    # Downloads a page.
    #
    # @param uri [String] address to fetch
    # @return [String, false, nil] the body for an HTML response with a status
    #   in 200...400; nil for other content types or on any error (which is
    #   printed); false for other status codes
    def fetch_html(uri)
      uri = Addressable::URI.parse(uri.strip).normalize.to_str
      response = ::RestClient.get uri, accept: :html
      content_type = response.headers[:content_type].to_s
      case response.code
      when 200...400
        response.to_str if html?(content_type)
      else
        false
      end
    rescue => e
      # Also covers an unparsable address, so one bad link cannot abort the
      # whole crawl.
      puts "error fetching html on #{uri}"
      puts e
      nil
    end

    # @param content_type [String] value of a Content-Type header
    # @return [Boolean] whether the header announces HTML
    def html?(content_type)
      content_type.start_with?('text/html')
    end

    # @param uri [String, nil] link target
    # @return [Boolean] whether the target is non-empty and on the crawled
    #   host; malformed addresses count as foreign
    def same_host?(uri)
      return false if uri.to_s.empty?
      Addressable::URI.parse(uri).host.eql?(@uri.host)
    rescue Addressable::URI::InvalidURIError
      false
    end

    # Instantiates every configured service for the given document.
    #
    # @param html [String] document handed to each service
    # @return [Hash] service keyname => service instance
    def init_services_hash(html)
      @options[:services].map do |s|
        s = s.to_sym
        service_class = SERVICE_MAPPING[s]
        available_options = service_class.available_options
        # Pass on only the options the service declares.
        opts = available_options.map { |o| [o, @options[o.to_sym]] }.to_h
        service = service_class.new(html, opts)
        [service_class.keyname, service]
      end.to_h
    end

    # Sends the document through the AbsoluteUrl service, using scheme and
    # host of the start address as base url.
    #
    # @param html [String] document to convert
    # @return [String, nil] converted document, nil when the service failed
    def canonize_html(html)
      baseurl = "#{@uri.scheme}://#{@uri.host}"
      converter = Service::AbsoluteUrl.new(html, baseurl: baseurl)
      converter.execute
    end

    # Runs all configured services on a document.
    #
    # @param html [String] fetched document
    # @return [Hash] service keyname (Symbol) => service response
    def perform_services(html)
      # Keep the fetched document when canonicalisation fails rather than
      # feeding nil to every service.
      html = canonize_html(html) || html
      init_services_hash(html).each_with_object({}) do |(json_field, service), result|
        result[json_field.to_sym] = service.execute
      end
    end

    private

    # Recursive worker behind #crawl: visits one address unless it was
    # already visited, then follows its same-host links.
    def crawl_page(uri)
      Abrupt.log '.'
      return if @result[uri]
      html = fetch_html(uri)
      @result[uri] = html ? perform_services(html) : {}
      return unless @follow_links
      uris_with_same_host(uri).uniq.each { |url| crawl_page(url) }
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # @author Manuel Dudda
2
module Abrupt
  module Service
    # AbsoluteUrl service
    # documentation see 'http://wba.cs.hs-rm.de/AbRUPt/service/absoluteurl/'
    # (presumably rewrites the urls of a document against the baseurl option;
    # verify against the service documentation)
    class AbsoluteUrl < Base
      # TODO: outsource service uri to module Service
      SERVICE_URI = 'http://wba.cs.hs-rm.de/AbRUPt/service/absoluteurl/'

      # @return [String] endpoint of this service
      def service_uri
        SERVICE_URI
      end

      # Posts the document to the service and returns the raw answer.
      #
      # @return [String, nil] response body, or nil when the request raised
      #   (the error is printed to standard output)
      def execute
        request = {
          method: :post,
          timeout: 6000,
          open_timeout: 6000,
          accept: :html,
          url: @url,
          payload: @html
        }
        RestClient::Request.execute(request).to_str
      rescue => e
        puts "some problems with #{@url}"
        puts e
        nil
      end
    end
  end
end
@@ -0,0 +1,75 @@
1
+ # @author Manuel Dudda
2
+ require 'rest_client'
3
module Abrupt
  module Service
    # Base class of the service clients: builds the request url from the
    # service endpoint plus options, posts an HTML document to it and parses
    # the JSON answer.
    class Base
      attr_accessor :url, :abbr, :options, :response
      # TODO: outsource service uri to module Service
      SERVICE_URI = 'http://wba.cs.hs-rm.de/AbRUPt/service/complexity/public/index.php/api/v1/complexity'

      # @return [String] endpoint of the service
      def service_uri
        SERVICE_URI
      end

      # @param html [String] document that #execute sends as payload
      # @param options [Hash] appended to the url as "key=value" query
      #   parameters (values are not url-encoded)
      def initialize(html, options = {})
        @html = html
        # Keep the options readable through the accessor; they used to be
        # overwritten with an empty array at the end of the constructor.
        @options = options || {}
        query = @options.map { |k, v| "#{k}=#{v}" }.join('&')
        @url = query.empty? ? service_uri : "#{service_uri}?#{query}"
        # First letter of the class name without its namespace. Taking it
        # from the full name always produced "a" (for "Abrupt").
        @abbr = self.class.keyname[0]
      end

      # @return [Array] names of the options a service accepts (none here)
      def self.available_options
        []
      end

      # @return [String] downcased class name without its namespace
      def self.keyname
        name.split('::').last.downcase
      end

      # TODO: naming of interface execute
      # Posts the document to the service and stores the parsed answer in
      # #response.
      #
      # @return [Object, nil] parsed JSON, or nil when the request or the
      #   parsing raised (the error is printed to standard output)
      def execute
        # NOTE(review): :accept is given as a top-level request option here,
        # not inside :headers - confirm that RestClient actually sends it.
        request = {
          method: :post,
          timeout: 6000,
          open_timeout: 6000,
          accept: :schema,
          url: @url,
          payload: @html
        }
        res = RestClient::Request.execute(request).to_str
        @response = JSON.parse(res)
      rescue => e
        puts "some problems with #{@url}"
        puts e
        nil
      end

      # Converts a crawl result (page uri => service results) into the nested
      # website structure. The domain is derived from the first key.
      #
      # @param hsh [Hash] page uri => state data
      # @return [Hash] { website: { domain: ..., url: [{ name:, state: }] } }
      #   with all keys symbolized
      def self.transform_hash(hsh)
        uri = Addressable::URI.parse(hsh.keys.first).normalize
        pages = hsh.map { |name, state| { name: name, state: state } }
        {
          website: {
            domain: "#{uri.scheme}://#{uri.host}",
            url: pages
          }
        }.deep_symbolize_keys
      end
    end
  end
end