vizi_tracker 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. data/README.rdoc +116 -0
  2. data/Rakefile +44 -0
  3. data/config/logger-backup.yml +49 -0
  4. data/config/logger.yml +69 -0
  5. data/config/logger_apache.yml +65 -0
  6. data/config/logger_sample.yml +69 -0
  7. data/data/exlog.log +5458 -0
  8. data/data/sample-alter.log +11870 -0
  9. data/data/sample-surf.log +47 -0
  10. data/data/sample-wle.log +30474 -0
  11. data/data/testlog.log +270 -0
  12. data/data/vizitrax.log +17951 -0
  13. data/doc/Object.html +200 -0
  14. data/doc/ParserTest.html +268 -0
  15. data/doc/README_rdoc.html +128 -0
  16. data/doc/Rakefile.html +148 -0
  17. data/doc/Visit.html +487 -0
  18. data/doc/VisitList.html +385 -0
  19. data/doc/Vizi/LogFormat.html +377 -0
  20. data/doc/Vizi/LogParser.html +551 -0
  21. data/doc/Vizi/Visit.html +487 -0
  22. data/doc/Vizi/VisitList.html +386 -0
  23. data/doc/Vizi.html +168 -0
  24. data/doc/ViziLogFormat.html +382 -0
  25. data/doc/ViziLogParser.html +551 -0
  26. data/doc/created.rid +8 -0
  27. data/doc/formats/apache-custom-log.pdf +0 -0
  28. data/doc/formats/apache.rtf +238 -0
  29. data/doc/formats/format-descriptions.xls +0 -0
  30. data/doc/formats/w3c-extended.pdf +0 -0
  31. data/doc/formats/w3c-extended.rtf +135 -0
  32. data/doc/images/brick.png +0 -0
  33. data/doc/images/brick_link.png +0 -0
  34. data/doc/images/bug.png +0 -0
  35. data/doc/images/bullet_black.png +0 -0
  36. data/doc/images/bullet_toggle_minus.png +0 -0
  37. data/doc/images/bullet_toggle_plus.png +0 -0
  38. data/doc/images/date.png +0 -0
  39. data/doc/images/find.png +0 -0
  40. data/doc/images/loadingAnimation.gif +0 -0
  41. data/doc/images/macFFBgHack.png +0 -0
  42. data/doc/images/package.png +0 -0
  43. data/doc/images/page_green.png +0 -0
  44. data/doc/images/page_white_text.png +0 -0
  45. data/doc/images/page_white_width.png +0 -0
  46. data/doc/images/plugin.png +0 -0
  47. data/doc/images/ruby.png +0 -0
  48. data/doc/images/tag_green.png +0 -0
  49. data/doc/images/wrench.png +0 -0
  50. data/doc/images/wrench_orange.png +0 -0
  51. data/doc/images/zoom.png +0 -0
  52. data/doc/index.html +112 -0
  53. data/doc/js/darkfish.js +116 -0
  54. data/doc/js/jquery.js +32 -0
  55. data/doc/js/quicksearch.js +114 -0
  56. data/doc/js/thickbox-compressed.js +10 -0
  57. data/doc/lib/vizi/parser_rb.html +63 -0
  58. data/doc/lib/vizi/vizi_tracker_rb.html +63 -0
  59. data/doc/lib/vizi_log_parser_rb.html +56 -0
  60. data/doc/lib/vizi_tracker_rb.html +56 -0
  61. data/doc/rdoc.css +759 -0
  62. data/doc/test/parser_test_rb.html +54 -0
  63. data/doc/test/test_helper_rb.html +56 -0
  64. data/doc/testit_rb.html +63 -0
  65. data/lib/vizi/vizi_tracker.rb +406 -0
  66. data/lib/vizi_tracker.rb +5 -0
  67. data/log/parse.log +79 -0
  68. data/log/system.log +66 -0
  69. data/test/parser_test.rb +48 -0
  70. data/test/test_helper.rb +3 -0
  71. data/testit.rb +105 -0
  72. data/vizi_tracker.gemspec +21 -0
  73. metadata +146 -0
data/README.rdoc ADDED
@@ -0,0 +1,116 @@
1
+ = ViziTracker
2
+
3
+ == Introduction
4
+
5
+ This gem provides a set of classes to support the parsing of web log files and
6
+ the creation of Visit records from the individual parsed web log records.
7
+
8
+ == Installation
9
+
10
+ Just run:
11
+
12
+ gem install vizi_tracker
13
+
14
+ The following code uses a YAML file to store the configuration attributes that will
15
+ drive the logging activity. This file (logger.yml) should be set up and stored in the
16
+ config sub-directory. Refer also to logger_sample.yml for more details on the
17
+ configuration values.
18
+
19
+ == Usage
20
+
21
+ require 'vizi_tracker'
22
+ require 'yaml'
23
+ require 'logger'
24
+
25
+ config = YAML.load_file("config/logger.yml")
26
+
27
+ @@download_page_number = config["download_page_number"]
28
+ @@visit_timeout = config["visit_timeout"]
29
+
30
+ # Initialize the log parser
31
+ parser = Vizi::LogParser.new(config["drop_ips"], config["spider_ips"],
32
+ config["spider_names"], config["page_urls"], config["hide_urls"],
33
+ config["homepage"], config["accept_only_homepage"],config["hostname"],
34
+ config["drop_refers_by_hostname"], config["use_local_time"],
35
+ config["assigned_numbers"], config["match_page_numbers"])
36
+
37
+ syslog = Logger.new('./log/system.log',shift_age = 'weekly')
38
+ case config["log_level"]
39
+ when "info"
40
+ syslog.level = Logger::INFO
41
+ when "warn"
42
+ syslog.level = Logger::WARN
43
+ when "error"
44
+ syslog.level = Logger::ERROR
45
+ when "fatal"
46
+ syslog.level = Logger::FATAL
47
+ else
48
+ syslog.level = Logger::DEBUG
49
+ end
50
+ syslog.info "starting ... >>> "+Time.now.to_s
51
+
52
+ # Open log file for reading
53
+ File.open('./data/exlog.log', 'r') do |file|
54
+ vlist = Vizi::VisitList.new
55
+ rec_count = 0
56
+ hit_count = 0
57
+ max_rec_count = 99999
58
+ max_rec_count = config["max_rec_count"] if config["max_rec_count"]
59
+ visit_count = 0
60
+ page_count = 0
61
+ human_count = 0
62
+ drop_count = 0
63
+ spider_count = 0
64
+ start_time = Time.now
65
+ logformat = nil
66
+ # Begin to parse each record
67
+ while(line = file.gets)
68
+ parsed_data = parser.parse_line(line, logformat)
69
+ logformat = parsed_data[:p_logformat]
70
+ rec_count = rec_count + 1
71
+ next if parsed_data[:p_linetype] != "V"
72
+ hit_count = hit_count + 1
73
+ page_count = page_count + 1 if parsed_data[:p_pageflag]
74
+ @visit=vlist.find_by_ip(parsed_data[:ip])
75
+ if @visit.nil?
76
+ vlist.append(Vizi::Visit.new(parsed_data[:ip],parsed_data[:datetime],parsed_data[:csuristem],parsed_data[:csuriquery], parsed_data[:timetaken],
77
+ parsed_data[:p_visitortype],parsed_data[:p_pageflag],parsed_data[:p_searchphrase],parsed_data[:p_pageid]))
78
+ @visit=vlist.find_by_ip(parsed_data[:ip])
79
+ visit_count = visit_count + 1
80
+ else
81
+ @visit.update(parsed_data[:datetime],parsed_data[:csuriquery],parsed_data[:timetaken],
82
+ parsed_data[:p_visitortype],parsed_data[:p_pageflag],parsed_data[:p_searchphrase], parsed_data[:p_pageid])
83
+ end
84
+ @visits = vlist.find_expired(@visit.start_dt)
85
+ if @visits
86
+ @visits.sendoutput
87
+ vlist.delete(@visits)
88
+ human_count = human_count + 1 if @visits.visitortype == "H"
89
+ drop_count = drop_count + 1 if @visits.visitortype == "D"
90
+ spider_count = spider_count + 1 if @visits.visitortype == "S"
91
+ end
92
+ break if rec_count == max_rec_count
93
+ end
94
+ @visits = vlist.find_all
95
+ @visits.each {|v|
96
+ v.sendoutput
97
+ human_count = human_count + 1 if v.visitortype == "H"
98
+ drop_count = drop_count + 1 if v.visitortype == "D"
99
+ spider_count = spider_count + 1 if v.visitortype == "S"
100
+ }
101
+ if config["summary_flag"]
102
+ syslog.info "Record count is "+rec_count.to_s
103
+ syslog.info "Hit count is "+hit_count.to_s
104
+ syslog.info "Page count is "+page_count.to_s
105
+ syslog.info "Total visit count is "+visit_count.to_s
106
+ syslog.info "Human visit count is "+human_count.to_s
107
+ syslog.info "Drop visit count is "+drop_count.to_s
108
+ syslog.info "Spider visit count is "+spider_count.to_s
109
+ syslog.info "Batch processing time "+(Time.now-start_time).to_s
110
+ end
111
+ syslog.info "ending ... >>> "+Time.now.to_s
112
+ end
113
+
114
+ == License
115
+
116
+ This code is made available under the MIT license. It is based on original parser code from Jan Wikholm.
data/Rakefile ADDED
@@ -0,0 +1,44 @@
1
+ require 'rubygems'
2
+ require 'rake/gempackagetask'
3
+ require 'rake/rdoctask'
4
+ require 'rake/testtask'
5
+
6
+ spec = Gem::Specification.new do |s|
7
+ s.name = "vizi_tracker"
8
+ s.version = "0.1.0"
9
+ s.author = "Al Kivi"
10
+ s.email = "al.kivi at vizitrax.com"
11
+ s.homepage = "http://github.com/al-kivi/vizi_tracker"
12
+ s.description = "Univeral web log file parser and visit analyzer"
13
+ s.summary = "A package for parsing web server logs and creating visit records"
14
+
15
+ s.platform = Gem::Platform::RUBY
16
+ s.has_rdoc = true
17
+ s.extra_rdoc_files = ["README.rdoc"]
18
+
19
+ s.require_path = "lib"
20
+ s.files = %w(README.rdoc Rakefile) + Dir.glob("lib/**/*")
21
+ end
22
+
23
+ Rake::GemPackageTask.new(spec) do |pkg|
24
+ pkg.need_tar = true
25
+ end
26
+
27
+ Rake::RDocTask.new(:rdoc) do |rdoc|
28
+ rdoc.rdoc_dir = 'rdoc'
29
+ rdoc.title = 'ViziTracker'
30
+ rdoc.options << '--line-numbers' << '--inline-source'
31
+ rdoc.rdoc_files.include('README')
32
+ rdoc.rdoc_files.include('lib/**/*.rb')
33
+ end
34
+
35
+ Rake::TestTask.new do |t|
36
+ t.libs << 'test'
37
+ t.test_files = FileList["test/**/*_test.rb"]
38
+ t.verbose = true
39
+ end
40
+
41
+ task :default => "pkg/#{spec.name}-#{spec.version}.gem" do
42
+ puts "generated latest version"
43
+ end
44
+
data/config/logger-backup.yml ADDED
@@ -0,0 +1,49 @@
1
+ # Configuration data for logger
2
+ visit_timeout: 1200
3
+ summary_flag: true
4
+ convert_to_lower_case: true
5
+ #max_rec_count: 100
6
+ homepage: home.aspx
7
+ accept_only_homepage: true
8
+ page_extensions:
9
+ - aspx
10
+ - asp
11
+ hostname: www.sigma-systems.com
12
+ drop_refers_by_hostname: true
13
+ drop_ips:
14
+ - 76.12.185.100
15
+ spider_ips:
16
+ - 66.98.254.55
17
+ - 64.208.168.252
18
+ - 64.235.108.183
19
+ - 76.2.144.115
20
+ - 66.98.254.236
21
+ - 202.108.22.132
22
+ - 89.122.29.77
23
+ - 95.174.93.222
24
+ - 66.55.37.179
25
+ - 198.45.18.20
26
+ - 38.104.227.3
27
+ spider_names:
28
+ - bot
29
+ - spider
30
+ - slurp
31
+ - root.exe
32
+ - .dll
33
+ - slurp
34
+ - looksmart
35
+ - nutchsvc
36
+ - iconsurf
37
+ - objectsearch
38
+ - openfind
39
+ - iltovatore
40
+ - mozdex
41
+ - netresearch
42
+ - konsqueror
43
+ - crawler
44
+ - searchme
45
+ - java/1.6.0_04
46
+ - scoutjet
47
+ - yeti
48
+ - yandex
49
+ drop_spiders: true
data/config/logger.yml ADDED
@@ -0,0 +1,69 @@
1
+ # Configuration data for logger
2
+ #max_rec_count: 100
3
+ visit_timeout: 1200
4
+ summary_flag: true
5
+ convert_to_lower_case: true
6
+ log_level: warn
7
+ #log_level options are debug, info, warn, error, fatal
8
+ homepage: /home.aspx
9
+ #homepage: /
10
+ accept_only_homepage: true
11
+ page_urls:
12
+ - aspx
13
+ - asp
14
+ hide_urls:
15
+ - css
16
+ - js
17
+ - gif
18
+ - swf
19
+ - ico
20
+ - chart
21
+ - robots
22
+ hostname: www.sigma-systems.com
23
+ drop_refers_by_hostname: true
24
+ use_local_time: true
25
+ download_page_number: 45
26
+ drop_ips:
27
+ - 76.12.185.100
28
+ spider_ips:
29
+ - 66.98.254.55
30
+ - 64.208.168.252
31
+ - 64.235.108.183
32
+ - 76.2.144.115
33
+ - 66.98.254.236
34
+ - 202.108.22.132
35
+ - 89.122.29.77
36
+ - 95.174.93.222
37
+ - 66.55.37.179
38
+ - 198.45.18.20
39
+ - 38.104.227.3
40
+ spider_names:
41
+ - bot
42
+ - spider
43
+ - slurp
44
+ - root.exe
45
+ - .dll
46
+ - slurp
47
+ - looksmart
48
+ - nutchsvc
49
+ - iconsurf
50
+ - objectsearch
51
+ - openfind
52
+ - iltovatore
53
+ - mozdex
54
+ - netresearch
55
+ - konsqueror
56
+ - crawler
57
+ - searchme
58
+ - java/1.6.0_04
59
+ - scoutjet
60
+ - yeti
61
+ - yandex
62
+ # convert urls to assigned numbers where numbers cannot be parsed from url
63
+ assigned_numbers:
64
+ - /visit/index,1
65
+ - /visit/show/,2
66
+ - /visit/vcardedit/,3
67
+ - /visit/showmap_na,4
68
+ - /visit/showmap_row,5
69
+ match_page_numbers: false
data/config/logger_apache.yml ADDED
@@ -0,0 +1,65 @@
1
+ # Configuration data for logger
2
+ #max_rec_count: 100
3
+ visit_timeout: 1200
4
+ summary_flag: true
5
+ convert_to_lower_case: true
6
+ #homepage: /home.aspx
7
+ homepage: /
8
+ accept_only_homepage: true
9
+ page_urls:
10
+ - aspx
11
+ - asp
12
+ hide_urls:
13
+ - css
14
+ - js
15
+ - gif
16
+ - swf
17
+ - ico
18
+ - chart
19
+ - robots
20
+ hostname: www.sigma-systems.com
21
+ drop_refers_by_hostname: true
22
+ getlocal_time: true
23
+ drop_ips:
24
+ - 76.12.185.100
25
+ spider_ips:
26
+ - 66.98.254.55
27
+ - 64.208.168.252
28
+ - 64.235.108.183
29
+ - 76.2.144.115
30
+ - 66.98.254.236
31
+ - 202.108.22.132
32
+ - 89.122.29.77
33
+ - 95.174.93.222
34
+ - 66.55.37.179
35
+ - 198.45.18.20
36
+ - 38.104.227.3
37
+ spider_names:
38
+ - bot
39
+ - spider
40
+ - slurp
41
+ - root.exe
42
+ - .dll
43
+ - slurp
44
+ - looksmart
45
+ - nutchsvc
46
+ - iconsurf
47
+ - objectsearch
48
+ - openfind
49
+ - iltovatore
50
+ - mozdex
51
+ - netresearch
52
+ - konsqueror
53
+ - crawler
54
+ - searchme
55
+ - java/1.6.0_04
56
+ - scoutjet
57
+ - yeti
58
+ - yandex
59
+ page_numbers:
60
+ - /visit/index,1
61
+ - /visit/show/,2
62
+ - /visit/vcardedit/,3
63
+ - /visit/showmap_na,4
64
+ - /visit/showmap_row,5
65
+ match_page_numbers: true
data/config/logger_sample.yml ADDED
@@ -0,0 +1,69 @@
1
+ # Configuration data for logger
2
+ #max_rec_count: 100
3
+ visit_timeout: 1200
4
+ summary_flag: true
5
+ convert_to_lower_case: true
6
+ log_level: warn
7
+ #log_level options are debug, info, warn, error, fatal
8
+ homepage: /home.aspx
9
+ #homepage: /
10
+ accept_only_homepage: true
11
+ page_urls:
12
+ - aspx
13
+ - asp
14
+ hide_urls:
15
+ - css
16
+ - js
17
+ - gif
18
+ - swf
19
+ - ico
20
+ - chart
21
+ - robots
22
+ hostname: www.sigma-systems.com
23
+ drop_refers_by_hostname: true
24
+ use_local_time: true
25
+ download_page_number: 45
26
+ drop_ips:
27
+ - 76.12.185.100
28
+ spider_ips:
29
+ - 66.98.254.55
30
+ - 64.208.168.252
31
+ - 64.235.108.183
32
+ - 76.2.144.115
33
+ - 66.98.254.236
34
+ - 202.108.22.132
35
+ - 89.122.29.77
36
+ - 95.174.93.222
37
+ - 66.55.37.179
38
+ - 198.45.18.20
39
+ - 38.104.227.3
40
+ spider_names:
41
+ - bot
42
+ - spider
43
+ - slurp
44
+ - root.exe
45
+ - .dll
46
+ - slurp
47
+ - looksmart
48
+ - nutchsvc
49
+ - iconsurf
50
+ - objectsearch
51
+ - openfind
52
+ - iltovatore
53
+ - mozdex
54
+ - netresearch
55
+ - konsqueror
56
+ - crawler
57
+ - searchme
58
+ - java/1.6.0_04
59
+ - scoutjet
60
+ - yeti
61
+ - yandex
62
+ # convert urls to assigned numbers where numbers cannot be parsed from url
63
+ assigned_numbers:
64
+ - /visit/index,1
65
+ - /visit/show/,2
66
+ - /visit/vcardedit/,3
67
+ - /visit/showmap_na,4
68
+ - /visit/showmap_row,5
69
+ match_page_numbers: false