websitary 0.3 → 0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +11 -0
- data/README.txt +21 -9
- data/Rakefile +5 -1
- data/bin/websitary +2 -2
- data/lib/websitary.rb +11 -95
- data/lib/websitary/configuration.rb +224 -35
- data/lib/websitary/htmldiff.rb +22 -4
- metadata +60 -53
data/History.txt
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
= 0.4
|
2
|
+
|
3
|
+
* Sources may have a :timeout option.
|
4
|
+
* exclude: Argument can be a string or a regexp.
|
5
|
+
* htmldiff: :ignore option to exclude certain nodes from the diff.
|
6
|
+
* Left-mouse clicks make items collapse/expand.
|
7
|
+
* iconv: Support for converting encodings (requires the per-url iconv
|
8
|
+
option to be set).
|
9
|
+
* exclude mailto urls.
|
10
|
+
|
11
|
+
|
1
12
|
= 0.3
|
2
13
|
|
3
14
|
* Renamed the global option :downloadhtml to :download_html.
|
data/README.txt
CHANGED
@@ -189,9 +189,13 @@ This is the same a <tt>option :global, OPTION => VALUE</tt>.
|
|
189
189
|
|
190
190
|
Known global options:
|
191
191
|
|
192
|
-
<tt>:
|
193
|
-
|
194
|
-
|
192
|
+
<tt>:canonic_filename => BLOCK(FILENAME)</tt>::
|
193
|
+
Rewrite filenames as they are stored in the mtimes register. This may be
|
194
|
+
useful if you want to use the same repository on several computers
|
195
|
+
in different locations etc.
|
196
|
+
|
197
|
+
<tt>:encoding => OUTPUT_DOCUMENT_ENCODING</tt>::
|
198
|
+
The default is 'ISO-8859-1'.
|
195
199
|
|
196
200
|
<tt>:download_html => SHORTCUT</tt>::
|
197
201
|
The default shortcut for downloading plain HTML.
|
@@ -201,10 +205,12 @@ Known global options:
|
|
201
205
|
copies in the output. This may be useful if you want to use the same
|
202
206
|
repository on several computers in different locations etc.
|
203
207
|
|
204
|
-
<tt>:
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
+
<tt>:filename_size => N</tt>::
|
209
|
+
The max filename size. If a filename becomes longer, md5 encoding will
|
210
|
+
be used for local copies in the cache.
|
211
|
+
|
212
|
+
<tt>:toggle_body => BOOLEAN</tt>::
|
213
|
+
If true, make a news body collapsible on mouse clicks (sort of).
|
208
214
|
|
209
215
|
|
210
216
|
==== output_format FORMAT, output_format [FORMAT1, FORMAT2, ...]
|
@@ -270,6 +276,14 @@ Options
|
|
270
276
|
wraps the output in +pre+ tags. :webdiff, :body_html, :website_below,
|
271
277
|
:website, and :openuri will simply add a newline character.
|
272
278
|
|
279
|
+
<tt>:iconv => ENCODING</tt>::
|
280
|
+
If set, use iconv to convert the page body into the summary's document
|
281
|
+
encoding (see the 'global' section). Websitary currently isn't able to
|
282
|
+
automatically determine and convert encodings.
|
283
|
+
|
284
|
+
<tt>:timeout => SECONDS</tt>::
|
285
|
+
When using openuri, download the page with a timeout.
|
286
|
+
|
273
287
|
<tt>:hours => HOURS, :days => DAYS</tt>::
|
274
288
|
Don't download the file unless it's older than that
|
275
289
|
|
@@ -733,5 +747,3 @@ along with this program; if not, write to the Free Software
|
|
733
747
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
|
734
748
|
USA
|
735
749
|
|
736
|
-
|
737
|
-
% vi: ft=rd:tw=72:ts=4
|
data/Rakefile
CHANGED
@@ -21,7 +21,11 @@ require 'rtagstask'
|
|
21
21
|
RTagsTask.new
|
22
22
|
|
23
23
|
task :ctags do
|
24
|
-
`ctags --extra=+q --fields=+i -R bin lib`
|
24
|
+
`ctags --extra=+q --fields=+i+S -R bin lib`
|
25
|
+
end
|
26
|
+
|
27
|
+
task :files do
|
28
|
+
`find bin lib -name "*.rb" > files.lst`
|
25
29
|
end
|
26
30
|
|
27
31
|
# vim: syntax=Ruby
|
data/bin/websitary
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
#! /usr/bin/ruby
|
1
|
+
#! /usr/bin/env ruby
|
2
2
|
# websitary.rb -- The website news, rss feed, podcast catching monitor
|
3
|
-
# @Last Change: 2007-
|
3
|
+
# @Last Change: 2007-12-26.
|
4
4
|
# Author:: Thomas Link (micathom at gmail com)
|
5
5
|
# License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
|
6
6
|
# Created:: 2007-06-09.
|
data/lib/websitary.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# websitary.rb
|
2
|
-
# @Last Change:
|
2
|
+
# @Last Change: 2008-01-13.
|
3
3
|
# Author:: Thomas Link (micathom AT gmail com)
|
4
4
|
# License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
|
5
5
|
# Created:: 2007-09-08.
|
@@ -14,6 +14,7 @@ require 'pathname'
|
|
14
14
|
require 'rbconfig'
|
15
15
|
require 'uri'
|
16
16
|
require 'open-uri'
|
17
|
+
require 'timeout'
|
17
18
|
require 'yaml'
|
18
19
|
require 'rss'
|
19
20
|
|
@@ -32,8 +33,8 @@ end
|
|
32
33
|
|
33
34
|
module Websitary
|
34
35
|
APPNAME = 'websitary'
|
35
|
-
VERSION = '0.
|
36
|
-
REVISION = '
|
36
|
+
VERSION = '0.4'
|
37
|
+
REVISION = '2464'
|
37
38
|
end
|
38
39
|
|
39
40
|
require 'websitary/applog'
|
@@ -71,92 +72,7 @@ class Websitary::App
|
|
71
72
|
unless File.exists?(css)
|
72
73
|
$logger.info "Copying default css file: #{css}"
|
73
74
|
@configuration.write_file(css, 'w') do |io|
|
74
|
-
io.puts
|
75
|
-
body {
|
76
|
-
color: black;
|
77
|
-
background-color: #f0f0f0;
|
78
|
-
}
|
79
|
-
a.external {
|
80
|
-
}
|
81
|
-
a.old {
|
82
|
-
}
|
83
|
-
a.latest {
|
84
|
-
}
|
85
|
-
a.toc {
|
86
|
-
}
|
87
|
-
ol.toc {
|
88
|
-
float: left;
|
89
|
-
width: 200px;
|
90
|
-
position: fixed;
|
91
|
-
padding: 0;
|
92
|
-
margin: 0;
|
93
|
-
}
|
94
|
-
li.toc {
|
95
|
-
list-style: none;
|
96
|
-
border: 1px solid #e0e0e0;
|
97
|
-
background-color: #fafafa;
|
98
|
-
padding: 0.1em;
|
99
|
-
font-size: 80%;
|
100
|
-
font-family: Verdana, Myriad Web, Syntax, sans-serif;
|
101
|
-
}
|
102
|
-
li.toc:hover {
|
103
|
-
background-color: #ffff8d;
|
104
|
-
}
|
105
|
-
div.contents {
|
106
|
-
margin-left: 210px;
|
107
|
-
min-width: 16em;
|
108
|
-
}
|
109
|
-
div.webpage {
|
110
|
-
margin: 5px 0 5px 0;
|
111
|
-
padding: 5px;
|
112
|
-
border: 1px solid #e0e0e0;
|
113
|
-
background-color: white;
|
114
|
-
}
|
115
|
-
div.count {
|
116
|
-
text-align: right;
|
117
|
-
}
|
118
|
-
.enclosure {
|
119
|
-
padding: 4px;
|
120
|
-
margin: 4px 0 4px 0;
|
121
|
-
background: #f9f9f9;
|
122
|
-
}
|
123
|
-
h1.diff {
|
124
|
-
font-family: Verdana, Myriad Web, Syntax, sans-serif;
|
125
|
-
}
|
126
|
-
h2.rss {
|
127
|
-
border-top: 10px solid #f0f0f0;
|
128
|
-
padding-top: 10px;
|
129
|
-
}
|
130
|
-
div.diff {
|
131
|
-
padding-left: 2em;
|
132
|
-
}
|
133
|
-
pre.diff {
|
134
|
-
padding-left: 2em;
|
135
|
-
}
|
136
|
-
div.annotation {
|
137
|
-
font-size: 80%;
|
138
|
-
}
|
139
|
-
hr.separator {
|
140
|
-
width: 100%;
|
141
|
-
visibility: hidden;
|
142
|
-
}
|
143
|
-
.error {
|
144
|
-
color: yellow;
|
145
|
-
background-color: red;
|
146
|
-
}
|
147
|
-
.highlight-yellow {
|
148
|
-
background-color: #ffc730;
|
149
|
-
}
|
150
|
-
.highlight-red {
|
151
|
-
background-color: red;
|
152
|
-
}
|
153
|
-
.highlight-blue {
|
154
|
-
background-color: blue;
|
155
|
-
}
|
156
|
-
.highlight-aqua {
|
157
|
-
background-color: aqua;
|
158
|
-
}
|
159
|
-
CSS
|
75
|
+
io.puts @configuration.get_option(:page, :css)
|
160
76
|
end
|
161
77
|
end
|
162
78
|
end
|
@@ -318,7 +234,7 @@ CSS
|
|
318
234
|
difftext.delete('')
|
319
235
|
unless difftext.empty?
|
320
236
|
joindiffs = @configuration.get(url, :joindiffs, lambda {|t| t.join("\n")})
|
321
|
-
difftext = @configuration.call_cmd(joindiffs, [difftext]) if joindiffs
|
237
|
+
difftext = @configuration.call_cmd(joindiffs, [difftext], :url => url) if joindiffs
|
322
238
|
accumulate(url, difftext, opts)
|
323
239
|
end
|
324
240
|
aggrfiles.each do |file|
|
@@ -437,7 +353,7 @@ CSS
|
|
437
353
|
|
438
354
|
$logger.warn "Download: #{@configuration.get(url, :title, url).inspect}"
|
439
355
|
@configuration.done << url
|
440
|
-
text = @configuration.call_cmd(@configuration.get(url, :download), [url])
|
356
|
+
text = @configuration.call_cmd(@configuration.get(url, :download), [url], :url => url)
|
441
357
|
# $logger.debug text #DBG#
|
442
358
|
unless text
|
443
359
|
$logger.warn "no contents: #{@configuration.get(url, :title, url)}"
|
@@ -477,7 +393,7 @@ CSS
|
|
477
393
|
pprc = @configuration.get(url, :downloadprocess)
|
478
394
|
if pprc
|
479
395
|
$logger.debug "download process: #{pprc}"
|
480
|
-
text = @configuration.call_cmd(pprc, [text])
|
396
|
+
text = @configuration.call_cmd(pprc, [text], :url => url)
|
481
397
|
# $logger.debug text #DBG#
|
482
398
|
end
|
483
399
|
|
@@ -500,13 +416,13 @@ CSS
|
|
500
416
|
def diff(url, opts, new, old)
|
501
417
|
if File.exists?(old)
|
502
418
|
$logger.debug "diff: #{old} <-> #{new}"
|
503
|
-
difftext = @configuration.call_cmd(@configuration.get(url, :diff), [old, new])
|
419
|
+
difftext = @configuration.call_cmd(@configuration.get(url, :diff), [old, new], :url => url)
|
504
420
|
# $logger.debug "diff: #{difftext}" #DBG#
|
505
421
|
|
506
422
|
if difftext =~ /\S/
|
507
423
|
if (pprc = @configuration.get(url, :diffprocess))
|
508
424
|
$logger.debug "diff process: #{pprc}"
|
509
|
-
difftext = @configuration.call_cmd(pprc, [difftext])
|
425
|
+
difftext = @configuration.call_cmd(pprc, [difftext], :url => url)
|
510
426
|
end
|
511
427
|
# $logger.debug "difftext: #{difftext}" #DBG#
|
512
428
|
if difftext =~ /\S/
|
@@ -514,7 +430,7 @@ CSS
|
|
514
430
|
return difftext
|
515
431
|
end
|
516
432
|
end
|
517
|
-
|
433
|
+
|
518
434
|
$logger.debug "Unchanged: #{@configuration.get(url, :title, url).inspect}"
|
519
435
|
|
520
436
|
elsif File.exist?(new) and
|
@@ -1,5 +1,5 @@
|
|
1
1
|
# configuration.rb
|
2
|
-
# @Last Change:
|
2
|
+
# @Last Change: 2008-01-09.
|
3
3
|
# Author:: Thomas Link (micathom AT gmail com)
|
4
4
|
# License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
|
5
5
|
# Created:: 2007-09-08.
|
@@ -129,7 +129,7 @@ class Websitary::Configuration
|
|
129
129
|
end
|
130
130
|
|
131
131
|
opts.on('-x', '--exclude=N', Regexp, 'Exclude URLs matching this pattern') do |value|
|
132
|
-
exclude(value)
|
132
|
+
exclude(Regexp.new(value))
|
133
133
|
end
|
134
134
|
|
135
135
|
opts.separator ''
|
@@ -337,9 +337,14 @@ class Websitary::Configuration
|
|
337
337
|
|
338
338
|
|
339
339
|
def to_do(url)
|
340
|
-
|
341
|
-
|
342
|
-
|
340
|
+
@todo << url unless is_excluded?(url)
|
341
|
+
end
|
342
|
+
|
343
|
+
|
344
|
+
# Tell whether +url+ matches at least one of the exclusion patterns
# collected in @exclude. The verdict is written to the debug log and
# then returned (true or false).
def is_excluded?(url)
    excluded = false
    @exclude.each do |pattern|
        if url =~ pattern
            # One hit is enough; stop scanning the remaining patterns.
            excluded = true
            break
        end
    end
    $logger.debug "is_excluded: #{url}: #{excluded}"
    excluded
end
|
344
349
|
|
345
350
|
|
@@ -434,9 +439,19 @@ class Websitary::Configuration
|
|
434
439
|
|
435
440
|
|
436
441
|
# Configuration command:
|
437
|
-
# Add URL-exclusion patterns (REGEXPs).
|
442
|
+
# Add URL-exclusion patterns (REGEXPs or STRINGs).
|
438
443
|
def exclude(*urls)
|
439
|
-
@exclude += urls
|
444
|
+
@exclude += urls.map do |url|
|
445
|
+
case url
|
446
|
+
when Regexp
|
447
|
+
url
|
448
|
+
when String
|
449
|
+
Regexp.new(Regexp.escape(url))
|
450
|
+
else
|
451
|
+
$logger.fatal "Must be regexp or string: #{url.inspect}"
|
452
|
+
exit 5
|
453
|
+
end
|
454
|
+
end
|
440
455
|
end
|
441
456
|
|
442
457
|
|
@@ -461,10 +476,26 @@ class Websitary::Configuration
|
|
461
476
|
end
|
462
477
|
|
463
478
|
|
479
|
+
# Convert +text+ from the encoding named by the source's :iconv option
# into the output encoding (global option :encoding).
#
# url::  The source whose :iconv option is consulted.
# text:: The text to convert.
#
# Returns the converted text. Without an :iconv option the text is
# returned unchanged; if loading iconv or the conversion itself fails,
# the error is logged and the unconverted text is returned.
def format_text(url, text)
    enc = get(url, :iconv)
    return text unless enc
    denc = get_optionvalue(:global, :encoding)
    begin
        # Loaded lazily: iconv is only needed by sources that request it.
        require 'iconv'
        text = Iconv.conv(denc, enc, text)
    rescue StandardError, LoadError => e
        # Still best-effort, but narrower than "rescue Exception":
        # interrupts and exit requests must not be swallowed here.
        # LoadError is listed explicitly because it is not a
        # StandardError and a missing iconv should only be logged.
        $logger.error "IConv failed #{enc} => #{denc}: #{e}"
    end
    return text
end
|
492
|
+
|
493
|
+
|
464
494
|
# Format a diff according to URL's source options.
|
465
495
|
def format(url, difftext)
|
466
|
-
fmt
|
467
|
-
|
496
|
+
fmt = get(url, :format)
|
497
|
+
text = format_text(url, difftext)
|
498
|
+
eval_arg(fmt, [text], text)
|
468
499
|
end
|
469
500
|
|
470
501
|
|
@@ -493,8 +524,22 @@ class Websitary::Configuration
|
|
493
524
|
|
494
525
|
# Apply the argument to cmd (a format String or a Proc). If a
|
495
526
|
# String, execute the command.
|
496
|
-
def call_cmd(cmd,
|
497
|
-
|
527
|
+
def call_cmd(cmd, cmdargs, args={})
|
528
|
+
default = args[:default]
|
529
|
+
url = args[:url]
|
530
|
+
timeout = url ? get(url, :timeout) : nil
|
531
|
+
if timeout
|
532
|
+
begin
|
533
|
+
Timeout::timeout(timeout) do |timeout_length|
|
534
|
+
eval_arg(cmd, cmdargs, default) {|cmd| `#{cmd}`}
|
535
|
+
end
|
536
|
+
rescue Timeout::Error
|
537
|
+
$logger.error "Timeout #{timeout}: #{url}"
|
538
|
+
return default
|
539
|
+
end
|
540
|
+
else
|
541
|
+
eval_arg(cmd, cmdargs, default) {|cmd| `#{cmd}`}
|
542
|
+
end
|
498
543
|
end
|
499
544
|
|
500
545
|
|
@@ -630,15 +675,17 @@ class Websitary::Configuration
|
|
630
675
|
ext = %{ (#{old}, #{lst})}
|
631
676
|
urlr = url
|
632
677
|
end
|
633
|
-
note
|
678
|
+
note = difftext_annotation(url)
|
679
|
+
onclick = get_optionvalue(:global, :toggle_body) ? 'onclick="ToggleBody(this)"' : ''
|
634
680
|
<<HTML
|
635
|
-
<div id="#{bid}" class="webpage">
|
681
|
+
<div id="#{bid}" class="webpage" #{onclick}>
|
636
682
|
<div class="count">
|
637
683
|
#{idx}
|
638
684
|
</div>
|
639
685
|
<h1 class="diff">
|
640
|
-
<a class="external" href="#{urlr}">#{ti}</a>#{ext}
|
686
|
+
<a class="external" href="#{urlr}">#{format_text(url, ti)}</a>#{ext}
|
641
687
|
</h1>
|
688
|
+
<div id="#{bid}_body">
|
642
689
|
<div class="annotation">
|
643
690
|
#{note && CGI::escapeHTML(note)}
|
644
691
|
</div>
|
@@ -646,6 +693,7 @@ class Websitary::Configuration
|
|
646
693
|
#{format(url, text)}
|
647
694
|
</div>
|
648
695
|
</div>
|
696
|
+
</div>
|
649
697
|
HTML
|
650
698
|
end.join(('<hr class="separator"/>') + "\n")
|
651
699
|
|
@@ -795,7 +843,8 @@ HTML
|
|
795
843
|
# already included.
|
796
844
|
def push_hrefs(url, hpricot, &condition)
|
797
845
|
begin
|
798
|
-
|
846
|
+
$logger.debug "push_refs: #{url}"
|
847
|
+
return if robots?(hpricot, 'nofollow') or is_excluded?(url)
|
799
848
|
depth = get(url, :depth)
|
800
849
|
return if depth and depth <= 0
|
801
850
|
uri0 = URI.parse(url)
|
@@ -804,8 +853,8 @@ HTML
|
|
804
853
|
(hpricot / 'a').each do |a|
|
805
854
|
next if a['rel'] == 'nofollow'
|
806
855
|
href = a['href']
|
807
|
-
next if href.nil? or href == url or href =~ /^\s*javascript:/
|
808
|
-
|
856
|
+
next if href.nil? or href == url or href =~ /^\s*javascript:/ or href =~ /^\s*mailto:/ or is_excluded?(href)
|
857
|
+
uri = URI.parse(href)
|
809
858
|
pn = guess_dir(uri.path)
|
810
859
|
href = rewrite_href(href, url, uri0, pn0, true)
|
811
860
|
curl = canonic_url(href)
|
@@ -838,17 +887,33 @@ HTML
|
|
838
887
|
uri = URI.parse(url)
|
839
888
|
urd = guess_dir(uri.path)
|
840
889
|
(doc / 'a').each do |a|
|
841
|
-
href =
|
842
|
-
|
890
|
+
href = a['href']
|
891
|
+
if is_excluded?(href)
|
892
|
+
comment_element(doc, a)
|
893
|
+
else
|
894
|
+
href = rewrite_href(href, url, uri, urd, true)
|
895
|
+
a['href'] = href if href
|
896
|
+
end
|
843
897
|
end
|
844
898
|
(doc / 'img').each do |a|
|
845
|
-
href =
|
846
|
-
|
899
|
+
href = a['src']
|
900
|
+
if is_excluded?(href)
|
901
|
+
comment_element(doc, a)
|
902
|
+
else
|
903
|
+
href = rewrite_href(href, url, uri, urd, false)
|
904
|
+
a['src'] = href if href
|
905
|
+
end
|
847
906
|
end
|
848
907
|
doc
|
849
908
|
end
|
850
909
|
|
851
910
|
|
911
|
+
# Wrap +elt+ in an HTML comment by asking +doc+ to insert an opening
# marker before the element and a closing marker after it. Returns
# whatever doc.insert_after returns.
# NOTE(review): the argument order passed to insert_before/insert_after
# should be confirmed against the Hpricot API -- TODO verify.
def comment_element(doc, elt)
    opening, closing = '<!-- WEBSITARY: ', '-->'
    doc.insert_before(elt, opening)
    doc.insert_after(elt, closing)
end
|
915
|
+
|
916
|
+
|
852
917
|
# Try to make href an absolute url.
|
853
918
|
def rewrite_href(href, url, uri=nil, urd=nil, local=false)
|
854
919
|
begin
|
@@ -961,7 +1026,7 @@ HTML
|
|
961
1026
|
|
962
1027
|
|
963
1028
|
def canonic_filename(filename)
|
964
|
-
call_cmd(get_optionvalue(:global, :canonic_filename), [filename], filename)
|
1029
|
+
call_cmd(get_optionvalue(:global, :canonic_filename), [filename], :default => filename)
|
965
1030
|
end
|
966
1031
|
|
967
1032
|
|
@@ -970,6 +1035,8 @@ HTML
|
|
970
1035
|
@options = {
|
971
1036
|
:global => {
|
972
1037
|
:download_html => :openuri,
|
1038
|
+
:encoding => 'ISO-8859-1',
|
1039
|
+
:toggle_body => false,
|
973
1040
|
},
|
974
1041
|
}
|
975
1042
|
|
@@ -996,9 +1063,13 @@ HTML
|
|
996
1063
|
:raw => :new,
|
997
1064
|
|
998
1065
|
:htmldiff => lambda {|old, new|
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1066
|
+
url = url_from_filename(new)
|
1067
|
+
args = {
|
1068
|
+
:oldhtml => File.read(old),
|
1069
|
+
:newhtml => File.read(new),
|
1070
|
+
:ignore => get(url, :ignore),
|
1071
|
+
}
|
1072
|
+
difftext = Websitary::Htmldiff.new(args).diff
|
1002
1073
|
difftext
|
1003
1074
|
},
|
1004
1075
|
|
@@ -1130,7 +1201,8 @@ HTML
|
|
1130
1201
|
rss_diff = Websitary::Htmldiff.new(:highlight => 'highlight', :oldtext => olditem.description, :newtext => item.description).process
|
1131
1202
|
rnew << format_rss_item(item, rss_diff)
|
1132
1203
|
else
|
1133
|
-
|
1204
|
+
enc = item.respond_to?(:enclosure) && item.enclosure
|
1205
|
+
if enc and (curl = enc.url)
|
1134
1206
|
url = url_from_filename(new)
|
1135
1207
|
dir = get(url, :rss_enclosure)
|
1136
1208
|
curl = rewrite_href(curl, url, nil, nil, true)
|
@@ -1229,15 +1301,31 @@ HTML
|
|
1229
1301
|
}
|
1230
1302
|
|
1231
1303
|
@options[:page] = {
|
1232
|
-
:format => lambda
|
1304
|
+
:format => lambda {|ti, li, bd|
|
1233
1305
|
template = <<OUT
|
1234
1306
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
1235
1307
|
<html>
|
1236
1308
|
<head>
|
1237
1309
|
<title>%s</title>
|
1310
|
+
<meta http-equiv="Content-Type" content="text/html; charset=#{get_optionvalue(:global, :encoding)}">
|
1238
1311
|
<link rel="stylesheet" href="websitary.css" type="text/css">
|
1239
1312
|
<link rel="alternate" href="websitary.rss" type="application/rss+xml" title="%s">
|
1240
1313
|
</head>
|
1314
|
+
<script type="text/javascript">
|
1315
|
+
function ToggleBody(Item) {
|
1316
|
+
var Body = document.getElementById(Item.id + "_body");
|
1317
|
+
if (Body.style.visibility == "collapse") {
|
1318
|
+
Body.style.visibility = "visible";
|
1319
|
+
Body.style.height = "";
|
1320
|
+
Item.style.background = "";
|
1321
|
+
} else {
|
1322
|
+
Body.style.visibility = "collapse";
|
1323
|
+
Body.style.height = "1px";
|
1324
|
+
Item.style.background = "#e0f0f0";
|
1325
|
+
}
|
1326
|
+
return '';
|
1327
|
+
}
|
1328
|
+
</script>
|
1241
1329
|
<body>
|
1242
1330
|
<ol class="toc">
|
1243
1331
|
%s
|
@@ -1249,7 +1337,96 @@ HTML
|
|
1249
1337
|
</html>
|
1250
1338
|
OUT
|
1251
1339
|
template % [ti, ti, li, bd]
|
1252
|
-
|
1340
|
+
},
|
1341
|
+
:css => <<CSS,
|
1342
|
+
body {
|
1343
|
+
color: black;
|
1344
|
+
background-color: #f0f0f0;
|
1345
|
+
}
|
1346
|
+
a.external {
|
1347
|
+
}
|
1348
|
+
a.old {
|
1349
|
+
}
|
1350
|
+
a.latest {
|
1351
|
+
}
|
1352
|
+
a.toc {
|
1353
|
+
}
|
1354
|
+
ol.toc {
|
1355
|
+
float: left;
|
1356
|
+
width: 200px;
|
1357
|
+
position: fixed;
|
1358
|
+
padding: 0;
|
1359
|
+
margin: 0;
|
1360
|
+
}
|
1361
|
+
li.toc {
|
1362
|
+
list-style: none;
|
1363
|
+
border: 1px solid #e0e0e0;
|
1364
|
+
background-color: #fafafa;
|
1365
|
+
padding: 0.1em;
|
1366
|
+
font-size: 80%;
|
1367
|
+
font-family: Verdana, Myriad Web, Syntax, sans-serif;
|
1368
|
+
}
|
1369
|
+
li.toc:hover {
|
1370
|
+
background-color: #ffff8d;
|
1371
|
+
}
|
1372
|
+
div.contents {
|
1373
|
+
margin-left: 210px;
|
1374
|
+
min-width: 16em;
|
1375
|
+
}
|
1376
|
+
div.webpage {
|
1377
|
+
margin: 5px 0 5px 0;
|
1378
|
+
padding: 5px;
|
1379
|
+
border: 1px solid #e0e0e0;
|
1380
|
+
background-color: white;
|
1381
|
+
}
|
1382
|
+
div.count {
|
1383
|
+
text-align: right;
|
1384
|
+
}
|
1385
|
+
.enclosure {
|
1386
|
+
padding: 4px;
|
1387
|
+
margin: 4px 0 4px 0;
|
1388
|
+
background: #f9f9f9;
|
1389
|
+
}
|
1390
|
+
h1.diff {
|
1391
|
+
font-family: Verdana, Myriad Web, Syntax, sans-serif;
|
1392
|
+
}
|
1393
|
+
h2.rss {
|
1394
|
+
border-top: 10px solid #f0f0f0;
|
1395
|
+
padding-top: 10px;
|
1396
|
+
}
|
1397
|
+
div.diff {
|
1398
|
+
padding-left: 2em;
|
1399
|
+
}
|
1400
|
+
pre.diff {
|
1401
|
+
padding-left: 2em;
|
1402
|
+
}
|
1403
|
+
div.annotation {
|
1404
|
+
font-size: 80%;
|
1405
|
+
}
|
1406
|
+
hr.separator {
|
1407
|
+
width: 100%;
|
1408
|
+
visibility: hidden;
|
1409
|
+
}
|
1410
|
+
.error {
|
1411
|
+
color: yellow;
|
1412
|
+
background-color: red;
|
1413
|
+
}
|
1414
|
+
.highlight {
|
1415
|
+
background-color: #fac751;
|
1416
|
+
}
|
1417
|
+
.highlight-yellow {
|
1418
|
+
background-color: #ffc730;
|
1419
|
+
}
|
1420
|
+
.highlight-red {
|
1421
|
+
background-color: red;
|
1422
|
+
}
|
1423
|
+
.highlight-blue {
|
1424
|
+
background-color: blue;
|
1425
|
+
}
|
1426
|
+
.highlight-aqua {
|
1427
|
+
background-color: aqua;
|
1428
|
+
}
|
1429
|
+
CSS
|
1253
1430
|
}
|
1254
1431
|
end
|
1255
1432
|
|
@@ -1293,7 +1470,7 @@ OUT
|
|
1293
1470
|
|
1294
1471
|
|
1295
1472
|
def get_website(download, url)
|
1296
|
-
html = call_cmd(get_optionvalue(:download, download), [url])
|
1473
|
+
html = call_cmd(get_optionvalue(:download, download), [url], :url => url)
|
1297
1474
|
if html
|
1298
1475
|
doc = Hpricot(html)
|
1299
1476
|
if doc
|
@@ -1310,7 +1487,7 @@ OUT
|
|
1310
1487
|
|
1311
1488
|
def get_website_below(download, url)
|
1312
1489
|
dwnl = get_optionvalue(:download, download)
|
1313
|
-
html = call_cmd(dwnl, [url])
|
1490
|
+
html = call_cmd(dwnl, [url], :url => url)
|
1314
1491
|
if html
|
1315
1492
|
doc = Hpricot(html)
|
1316
1493
|
if doc
|
@@ -1373,7 +1550,7 @@ OUT
|
|
1373
1550
|
def read_url(url, type='html')
|
1374
1551
|
downloader = get(url, "download_#{type}".intern)
|
1375
1552
|
if downloader
|
1376
|
-
call_cmd(downloader, [url])
|
1553
|
+
call_cmd(downloader, [url], :url => url)
|
1377
1554
|
else
|
1378
1555
|
read_url_openuri(url)
|
1379
1556
|
end
|
@@ -1421,10 +1598,12 @@ OUT
|
|
1421
1598
|
|
1422
1599
|
|
1423
1600
|
def format_rss_item(item, body, enclosure='')
|
1424
|
-
|
1425
|
-
|
1601
|
+
ti = rss_field(item, :title)
|
1602
|
+
au = rss_field(item, :author)
|
1603
|
+
hd = [ti]
|
1604
|
+
hd << " (#{au})" if au
|
1426
1605
|
return <<EOT
|
1427
|
-
<h2 class="rss"><a class="rss" href="#{item
|
1606
|
+
<h2 class="rss"><a class="rss" href="#{rss_field(item, :link)}">#{hd.join} -- #{rss_field(item, :pubDate)}</a></h2>
|
1428
1607
|
<div class="rss">
|
1429
1608
|
#{body}
|
1430
1609
|
#{enclosure}
|
@@ -1432,6 +1611,16 @@ OUT
|
|
1432
1611
|
EOT
|
1433
1612
|
end
|
1434
1613
|
|
1614
|
+
|
1615
|
+
# Read +field+ from an rss +item+ through its accessor method.
# Returns +default+ when the item does not respond to +field+.
def rss_field(item, field, default=nil)
    item.respond_to?(field) ? item.send(field) : default
end
|
1622
|
+
|
1623
|
+
|
1435
1624
|
# Guess whether text is plain text or html.
|
1436
1625
|
def is_html?(text)
|
1437
1626
|
text =~ /<(div|a|span|body|html|script|p|table|td|tr|th|li|dt|br|hr|em|b)\b/
|
@@ -1524,7 +1713,7 @@ EOT
|
|
1524
1713
|
def file_url(filename)
|
1525
1714
|
# filename = File.join(File.basename(File.dirname(filename)), File.basename(filename))
|
1526
1715
|
# "file://#{encode(filename, ':/')}"
|
1527
|
-
filename = call_cmd(get_optionvalue(:global, :file_url), [filename], filename)
|
1716
|
+
filename = call_cmd(get_optionvalue(:global, :file_url), [filename], :default => filename)
|
1528
1717
|
encode(filename, ':/')
|
1529
1718
|
end
|
1530
1719
|
|
data/lib/websitary/htmldiff.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# htmldiff.rb
|
3
|
-
# @Last Change: 2007-10
|
3
|
+
# @Last Change: 2007-11-10.
|
4
4
|
# Author:: Thomas Link (micathom at gmail com)
|
5
5
|
# License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
|
6
6
|
# Created:: 2007-08-17.
|
@@ -17,7 +17,7 @@ module Websitary
|
|
17
17
|
# wrong results (especially false negatives) on certain occasions.
|
18
18
|
class Htmldiff
|
19
19
|
VERSION = '0.1'
|
20
|
-
REVISION = '
|
20
|
+
REVISION = '180'
|
21
21
|
|
22
22
|
# args:: A hash
|
23
23
|
# Fields:
|
@@ -30,6 +30,10 @@ module Websitary
|
|
30
30
|
@high = args[:highlight] || args[:highlightcolor]
|
31
31
|
@old = explode(args[:olddoc] || Hpricot(args[:oldtext] || File.read(args[:oldfile])))
|
32
32
|
@new = args[:newdoc] || Hpricot(args[:newtext] || File.read(args[:newfile]))
|
33
|
+
@ignore = args[:ignore]
|
34
|
+
if @ignore and !@ignore.kind_of?(Enumerable)
|
35
|
+
die "Ignore must be of kind Enumerable: #{ignore.inspect}"
|
36
|
+
end
|
33
37
|
@changed = false
|
34
38
|
end
|
35
39
|
|
@@ -46,11 +50,11 @@ module Websitary
|
|
46
50
|
# node, the whole node has changed. If only some sub-nodes have
|
47
51
|
# changed, collect those.
|
48
52
|
def process(node=@new)
|
49
|
-
acc
|
53
|
+
acc = []
|
50
54
|
node.each_child do |child|
|
51
55
|
ch = child.to_html.strip
|
52
56
|
next if ch.nil? or ch.empty?
|
53
|
-
if @old.include?(ch)
|
57
|
+
if @old.include?(ch) or ignore(child, ch)
|
54
58
|
if @high
|
55
59
|
acc << child
|
56
60
|
end
|
@@ -67,6 +71,20 @@ module Websitary
|
|
67
71
|
end
|
68
72
|
|
69
73
|
|
74
|
+
# Decide whether +node+ should be treated as unchanged because it
# matches one of the ignore expressions in @ignore.
#
# node::           The node, handed to Proc expressions.
# node_as_string:: The node's HTML text, matched against Regexp
#                  expressions.
#
# Returns a true value if any expression matches; nil/false if @ignore
# is unset or nothing matches. Any other kind of expression is
# reported via +die+.
# NOTE(review): +die+ is not defined in the visible code -- presumably
# provided elsewhere; confirm.
def ignore(node, node_as_string)
    return @ignore && @ignore.any? {|i|
        case i
        when Regexp
            node_as_string =~ i
        when Proc
            # Fixed: this used to call the undefined local +l+, so every
            # Proc expression raised a NameError; the block parameter
            # is +i+.
            i.call(node)
        else
            die "Unknown type for ignore expression: #{i.inspect}"
        end
    }
end
|
86
|
+
|
87
|
+
|
70
88
|
# Collect all nodes and subnodes in a hpricot document.
|
71
89
|
def explode(node)
|
72
90
|
if node.respond_to?(:each_child)
|
metadata
CHANGED
@@ -1,33 +1,45 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.4
|
3
|
-
specification_version: 1
|
4
2
|
name: websitary
|
5
3
|
version: !ruby/object:Gem::Version
|
6
|
-
version: "0.
|
7
|
-
date: 2007-10-26 00:00:00 +02:00
|
8
|
-
summary: A unified website news, rss feed, podcast monitor
|
9
|
-
require_paths:
|
10
|
-
- lib
|
11
|
-
email: micathom at gmail com
|
12
|
-
homepage: http://rubyforge.org/projects/websitiary/
|
13
|
-
rubyforge_project: websitiary
|
14
|
-
description: "== DESCRIPTION: websitary (formerly known as websitiary with an extra \"i\") monitors webpages, rss feeds, podcasts etc. It reuses other programs (w3m, diff etc.) to do most of the actual work. By default, it works on an ASCII basis, i.e. with the output of text-based webbrowsers like w3m (or lynx, links etc.) as the output can easily be post-processed. It can also work with HTML and highlight new items. This script was originally planned as a ruby-based websec replacement. By default, this script will use w3m to dump HTML pages and then run diff over the current page and the previous backup. Some pages are better viewed with lynx or links. Downloaded documents (HTML or ASCII) can be post-processed (e.g., filtered through some ruby block that extracts elements via hpricot and the like). Please see the configuration options below to find out how to change this globally or for a single source. This user manual is also available as PDF[http://websitiary.rubyforge.org/websitary.pdf]. == FEATURES/PROBLEMS: * Handle webpages, rss feeds (optionally save attachments in podcasts etc.) * Compare webpages with previous backups * Display differences between the current version and the backup * Provide hooks to post-process the downloaded documents and the diff * Display a one-page report summarizing all news * Automatically open the report in your favourite web-browser * Experimental: Download webpages on defined intervalls and generate incremental diffs."
|
15
|
-
autorequire:
|
16
|
-
default_executable:
|
17
|
-
bindir: bin
|
18
|
-
has_rdoc: true
|
19
|
-
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">"
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 0.0.0
|
24
|
-
version:
|
4
|
+
version: "0.4"
|
25
5
|
platform: ruby
|
26
|
-
signing_key:
|
27
|
-
cert_chain:
|
28
|
-
post_install_message:
|
29
6
|
authors:
|
30
7
|
- Thomas Link
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-01-13 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: "0"
|
23
|
+
version:
|
24
|
+
- !ruby/object:Gem::Dependency
|
25
|
+
name: hoe
|
26
|
+
version_requirement:
|
27
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
|
+
requirements:
|
29
|
+
- - ">="
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: 1.4.0
|
32
|
+
version:
|
33
|
+
description: "== DESCRIPTION: websitary (formerly known as websitiary with an extra \"i\") monitors webpages, rss feeds, podcasts etc. It reuses other programs (w3m, diff etc.) to do most of the actual work. By default, it works on an ASCII basis, i.e. with the output of text-based webbrowsers like w3m (or lynx, links etc.) as the output can easily be post-processed. It can also work with HTML and highlight new items. This script was originally planned as a ruby-based websec replacement. By default, this script will use w3m to dump HTML pages and then run diff over the current page and the previous backup. Some pages are better viewed with lynx or links. Downloaded documents (HTML or ASCII) can be post-processed (e.g., filtered through some ruby block that extracts elements via hpricot and the like). Please see the configuration options below to find out how to change this globally or for a single source. This user manual is also available as PDF[http://websitiary.rubyforge.org/websitary.pdf]. == FEATURES/PROBLEMS: * Handle webpages, rss feeds (optionally save attachments in podcasts etc.) * Compare webpages with previous backups * Display differences between the current version and the backup * Provide hooks to post-process the downloaded documents and the diff * Display a one-page report summarizing all news * Automatically open the report in your favourite web-browser * Experimental: Download webpages on defined intervalls and generate incremental diffs."
|
34
|
+
email: micathom at gmail com
|
35
|
+
executables:
|
36
|
+
- websitary
|
37
|
+
extensions: []
|
38
|
+
|
39
|
+
extra_rdoc_files:
|
40
|
+
- History.txt
|
41
|
+
- Manifest.txt
|
42
|
+
- README.txt
|
31
43
|
files:
|
32
44
|
- History.txt
|
33
45
|
- Manifest.txt
|
@@ -40,37 +52,32 @@ files:
|
|
40
52
|
- lib/websitary/configuration.rb
|
41
53
|
- lib/websitary/filemtimes.rb
|
42
54
|
- lib/websitary/htmldiff.rb
|
43
|
-
|
44
|
-
|
55
|
+
has_rdoc: true
|
56
|
+
homepage: http://rubyforge.org/projects/websitiary/
|
57
|
+
post_install_message:
|
45
58
|
rdoc_options:
|
46
59
|
- --main
|
47
60
|
- README.txt
|
48
|
-
|
49
|
-
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
-
|
54
|
-
|
55
|
-
|
61
|
+
require_paths:
|
62
|
+
- lib
|
63
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: "0"
|
68
|
+
version:
|
69
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: "0"
|
74
|
+
version:
|
56
75
|
requirements: []
|
57
76
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
- !ruby/object:Gem::Version
|
66
|
-
version: 0.0.0
|
67
|
-
version:
|
68
|
-
- !ruby/object:Gem::Dependency
|
69
|
-
name: hoe
|
70
|
-
version_requirement:
|
71
|
-
version_requirements: !ruby/object:Gem::Version::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: 1.3.0
|
76
|
-
version:
|
77
|
+
rubyforge_project: websitiary
|
78
|
+
rubygems_version: 1.0.1
|
79
|
+
signing_key:
|
80
|
+
specification_version: 2
|
81
|
+
summary: A unified website news, rss feed, podcast monitor
|
82
|
+
test_files: []
|
83
|
+
|