websitary 0.3 → 0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,14 @@
1
+ = 0.4
2
+
3
+ * Sources may have a :timeout option.
4
+ * exclude: Argument can be a string or a regexp.
5
+ * htmldiff: :ignore option to exclude certain nodes from the diff.
6
+ * Left-mouse clicks make items collapse/expand.
7
+ * iconv: Support for converting encodings (requires the per-url iconv
8
+ option to be set).
9
+ * Exclude mailto URLs.
10
+
11
+
1
12
  = 0.3
2
13
 
3
14
  * Renamed the global option :downloadhtml to :download_html.
data/README.txt CHANGED
@@ -189,9 +189,13 @@ This is the same a <tt>option :global, OPTION => VALUE</tt>.
189
189
 
190
190
  Known global options:
191
191
 
192
- <tt>:filename_size => N</tt>::
193
- The max filename size. If a filename becomes longer, md5 encoding will
194
- be used for local copies in the cache.
192
+ <tt>:canonic_filename => BLOCK(FILENAME)</tt>::
193
+ Rewrite filenames as they are stored in the mtimes register. This may
194
+ be useful if you want to use the same repository on several computers
195
+ in different locations etc.
196
+
197
+ <tt>:encoding => OUTPUT_DOCUMENT_ENCODING</tt>::
198
+ The default is 'ISO-8859-1'.
195
199
 
196
200
  <tt>:download_html => SHORTCUT</tt>::
197
201
  The default shortcut for downloading plain HTML.
@@ -201,10 +205,12 @@ Known global options:
201
205
  copies in the output. This may be useful if you want to use the same
202
206
  repository on several computers in different locations etc.
203
207
 
204
- <tt>:canonic_filename => BLOCK(FILENAME)</tt>::
205
- Rewrite filenames as they are stored in the mtimes register. This may
206
- useful if you want to use the same repository on several computers
207
- with in different locations etc.
208
+ <tt>:filename_size => N</tt>::
209
+ The max filename size. If a filename becomes longer, md5 encoding will
210
+ be used for local copies in the cache.
211
+
212
+ <tt>:toggle_body => BOOLEAN</tt>::
213
+ If true, make a news body collapsible on mouse clicks (sort of).
208
214
 
209
215
 
210
216
  ==== output_format FORMAT, output_format [FORMAT1, FORMAT2, ...]
@@ -270,6 +276,14 @@ Options
270
276
  wraps the output in +pre+ tags. :webdiff, :body_html, :website_below,
271
277
  :website, and :openuri will simply add a newline character.
272
278
 
279
+ <tt>:iconv => ENCODING</tt>::
280
+ If set, use iconv to convert the page body into the summary's document
281
+ encoding (see the 'global' section). Websitary currently isn't able to
282
+ automatically determine and convert encodings.
283
+
284
+ <tt>:timeout => SECONDS</tt>::
285
+ When using openuri, download the page with a timeout.
286
+
273
287
  <tt>:hours => HOURS, :days => DAYS</tt>::
274
288
  Don't download the file unless it's older than that
275
289
 
@@ -733,5 +747,3 @@ along with this program; if not, write to the Free Software
733
747
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
734
748
  USA
735
749
 
736
-
737
- % vi: ft=rd:tw=72:ts=4
data/Rakefile CHANGED
@@ -21,7 +21,11 @@ require 'rtagstask'
21
21
  RTagsTask.new
22
22
 
23
23
  task :ctags do
24
- `ctags --extra=+q --fields=+i -R bin lib`
24
+ `ctags --extra=+q --fields=+i+S -R bin lib`
25
+ end
26
+
27
+ task :files do
28
+ `find bin lib -name "*.rb" > files.lst`
25
29
  end
26
30
 
27
31
  # vim: syntax=Ruby
@@ -1,6 +1,6 @@
1
- #! /usr/bin/ruby.exe
1
+ #! /usr/bin/env ruby
2
2
  # websitary.rb -- The website news, rss feed, podcast catching monitor
3
- # @Last Change: 2007-09-09.
3
+ # @Last Change: 2007-12-26.
4
4
  # Author:: Thomas Link (micathom at gmail com)
5
5
  # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
6
6
  # Created:: 2007-06-09.
@@ -1,5 +1,5 @@
1
1
  # websitary.rb
2
- # @Last Change: 2007-10-26.
2
+ # @Last Change: 2008-01-13.
3
3
  # Author:: Thomas Link (micathom AT gmail com)
4
4
  # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
5
5
  # Created:: 2007-09-08.
@@ -14,6 +14,7 @@ require 'pathname'
14
14
  require 'rbconfig'
15
15
  require 'uri'
16
16
  require 'open-uri'
17
+ require 'timeout'
17
18
  require 'yaml'
18
19
  require 'rss'
19
20
 
@@ -32,8 +33,8 @@ end
32
33
 
33
34
  module Websitary
34
35
  APPNAME = 'websitary'
35
- VERSION = '0.3'
36
- REVISION = '2437'
36
+ VERSION = '0.4'
37
+ REVISION = '2464'
37
38
  end
38
39
 
39
40
  require 'websitary/applog'
@@ -71,92 +72,7 @@ class Websitary::App
71
72
  unless File.exists?(css)
72
73
  $logger.info "Copying default css file: #{css}"
73
74
  @configuration.write_file(css, 'w') do |io|
74
- io.puts <<CSS
75
- body {
76
- color: black;
77
- background-color: #f0f0f0;
78
- }
79
- a.external {
80
- }
81
- a.old {
82
- }
83
- a.latest {
84
- }
85
- a.toc {
86
- }
87
- ol.toc {
88
- float: left;
89
- width: 200px;
90
- position: fixed;
91
- padding: 0;
92
- margin: 0;
93
- }
94
- li.toc {
95
- list-style: none;
96
- border: 1px solid #e0e0e0;
97
- background-color: #fafafa;
98
- padding: 0.1em;
99
- font-size: 80%;
100
- font-family: Verdana, Myriad Web, Syntax, sans-serif;
101
- }
102
- li.toc:hover {
103
- background-color: #ffff8d;
104
- }
105
- div.contents {
106
- margin-left: 210px;
107
- min-width: 16em;
108
- }
109
- div.webpage {
110
- margin: 5px 0 5px 0;
111
- padding: 5px;
112
- border: 1px solid #e0e0e0;
113
- background-color: white;
114
- }
115
- div.count {
116
- text-align: right;
117
- }
118
- .enclosure {
119
- padding: 4px;
120
- margin: 4px 0 4px 0;
121
- background: #f9f9f9;
122
- }
123
- h1.diff {
124
- font-family: Verdana, Myriad Web, Syntax, sans-serif;
125
- }
126
- h2.rss {
127
- border-top: 10px solid #f0f0f0;
128
- padding-top: 10px;
129
- }
130
- div.diff {
131
- padding-left: 2em;
132
- }
133
- pre.diff {
134
- padding-left: 2em;
135
- }
136
- div.annotation {
137
- font-size: 80%;
138
- }
139
- hr.separator {
140
- width: 100%;
141
- visibility: hidden;
142
- }
143
- .error {
144
- color: yellow;
145
- background-color: red;
146
- }
147
- .highlight-yellow {
148
- background-color: #ffc730;
149
- }
150
- .highlight-red {
151
- background-color: red;
152
- }
153
- .highlight-blue {
154
- background-color: blue;
155
- }
156
- .highlight-aqua {
157
- background-color: aqua;
158
- }
159
- CSS
75
+ io.puts @configuration.get_option(:page, :css)
160
76
  end
161
77
  end
162
78
  end
@@ -318,7 +234,7 @@ CSS
318
234
  difftext.delete('')
319
235
  unless difftext.empty?
320
236
  joindiffs = @configuration.get(url, :joindiffs, lambda {|t| t.join("\n")})
321
- difftext = @configuration.call_cmd(joindiffs, [difftext]) if joindiffs
237
+ difftext = @configuration.call_cmd(joindiffs, [difftext], :url => url) if joindiffs
322
238
  accumulate(url, difftext, opts)
323
239
  end
324
240
  aggrfiles.each do |file|
@@ -437,7 +353,7 @@ CSS
437
353
 
438
354
  $logger.warn "Download: #{@configuration.get(url, :title, url).inspect}"
439
355
  @configuration.done << url
440
- text = @configuration.call_cmd(@configuration.get(url, :download), [url])
356
+ text = @configuration.call_cmd(@configuration.get(url, :download), [url], :url => url)
441
357
  # $logger.debug text #DBG#
442
358
  unless text
443
359
  $logger.warn "no contents: #{@configuration.get(url, :title, url)}"
@@ -477,7 +393,7 @@ CSS
477
393
  pprc = @configuration.get(url, :downloadprocess)
478
394
  if pprc
479
395
  $logger.debug "download process: #{pprc}"
480
- text = @configuration.call_cmd(pprc, [text])
396
+ text = @configuration.call_cmd(pprc, [text], :url => url)
481
397
  # $logger.debug text #DBG#
482
398
  end
483
399
 
@@ -500,13 +416,13 @@ CSS
500
416
  def diff(url, opts, new, old)
501
417
  if File.exists?(old)
502
418
  $logger.debug "diff: #{old} <-> #{new}"
503
- difftext = @configuration.call_cmd(@configuration.get(url, :diff), [old, new])
419
+ difftext = @configuration.call_cmd(@configuration.get(url, :diff), [old, new], :url => url)
504
420
  # $logger.debug "diff: #{difftext}" #DBG#
505
421
 
506
422
  if difftext =~ /\S/
507
423
  if (pprc = @configuration.get(url, :diffprocess))
508
424
  $logger.debug "diff process: #{pprc}"
509
- difftext = @configuration.call_cmd(pprc, [difftext])
425
+ difftext = @configuration.call_cmd(pprc, [difftext], :url => url)
510
426
  end
511
427
  # $logger.debug "difftext: #{difftext}" #DBG#
512
428
  if difftext =~ /\S/
@@ -514,7 +430,7 @@ CSS
514
430
  return difftext
515
431
  end
516
432
  end
517
-
433
+
518
434
  $logger.debug "Unchanged: #{@configuration.get(url, :title, url).inspect}"
519
435
 
520
436
  elsif File.exist?(new) and
@@ -1,5 +1,5 @@
1
1
  # configuration.rb
2
- # @Last Change: 2007-10-21.
2
+ # @Last Change: 2008-01-09.
3
3
  # Author:: Thomas Link (micathom AT gmail com)
4
4
  # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
5
5
  # Created:: 2007-09-08.
@@ -129,7 +129,7 @@ class Websitary::Configuration
129
129
  end
130
130
 
131
131
  opts.on('-x', '--exclude=N', Regexp, 'Exclude URLs matching this pattern') do |value|
132
- exclude(value)
132
+ exclude(Regexp.new(value))
133
133
  end
134
134
 
135
135
  opts.separator ''
@@ -337,9 +337,14 @@ class Websitary::Configuration
337
337
 
338
338
 
339
339
  def to_do(url)
340
- unless @exclude.any? {|p| url =~ p}
341
- @todo << url
342
- end
340
+ @todo << url unless is_excluded?(url)
341
+ end
342
+
343
+
344
+ def is_excluded?(url)
345
+ rv = @exclude.any? {|p| url =~ p}
346
+ $logger.debug "is_excluded: #{url}: #{rv}"
347
+ rv
343
348
  end
344
349
 
345
350
 
@@ -434,9 +439,19 @@ class Websitary::Configuration
434
439
 
435
440
 
436
441
  # Configuration command:
437
- # Add URL-exclusion patterns (REGEXPs).
442
+ # Add URL-exclusion patterns (REGEXPs or STRINGs).
438
443
  def exclude(*urls)
439
- @exclude += urls
444
+ @exclude += urls.map do |url|
445
+ case url
446
+ when Regexp
447
+ url
448
+ when String
449
+ Regexp.new(Regexp.escape(url))
450
+ else
451
+ $logger.fatal "Must be regexp or string: #{url.inspect}"
452
+ exit 5
453
+ end
454
+ end
440
455
  end
441
456
 
442
457
 
@@ -461,10 +476,26 @@ class Websitary::Configuration
461
476
  end
462
477
 
463
478
 
479
+ def format_text(url, text)
480
+ enc = get(url, :iconv)
481
+ if enc
482
+ denc = get_optionvalue(:global, :encoding)
483
+ begin
484
+ require 'iconv'
485
+ text = Iconv.conv(denc, enc, text)
486
+ rescue Exception => e
487
+ $logger.error "IConv failed #{enc} => #{denc}: #{e}"
488
+ end
489
+ end
490
+ return text
491
+ end
492
+
493
+
464
494
  # Format a diff according to URL's source options.
465
495
  def format(url, difftext)
466
- fmt = get(url, :format)
467
- eval_arg(fmt, [difftext], difftext)
496
+ fmt = get(url, :format)
497
+ text = format_text(url, difftext)
498
+ eval_arg(fmt, [text], text)
468
499
  end
469
500
 
470
501
 
@@ -493,8 +524,22 @@ class Websitary::Configuration
493
524
 
494
525
  # Apply the argument to cmd (a format String or a Proc). If a
495
526
  # String, execute the command.
496
- def call_cmd(cmd, args, default=nil)
497
- eval_arg(cmd, args, default) {|cmd| `#{cmd}`}
527
+ def call_cmd(cmd, cmdargs, args={})
528
+ default = args[:default]
529
+ url = args[:url]
530
+ timeout = url ? get(url, :timeout) : nil
531
+ if timeout
532
+ begin
533
+ Timeout::timeout(timeout) do |timeout_length|
534
+ eval_arg(cmd, cmdargs, default) {|cmd| `#{cmd}`}
535
+ end
536
+ rescue Timeout::Error
537
+ $logger.error "Timeout #{timeout}: #{url}"
538
+ return default
539
+ end
540
+ else
541
+ eval_arg(cmd, cmdargs, default) {|cmd| `#{cmd}`}
542
+ end
498
543
  end
499
544
 
500
545
 
@@ -630,15 +675,17 @@ class Websitary::Configuration
630
675
  ext = %{ (#{old}, #{lst})}
631
676
  urlr = url
632
677
  end
633
- note = difftext_annotation(url)
678
+ note = difftext_annotation(url)
679
+ onclick = get_optionvalue(:global, :toggle_body) ? 'onclick="ToggleBody(this)"' : ''
634
680
  <<HTML
635
- <div id="#{bid}" class="webpage">
681
+ <div id="#{bid}" class="webpage" #{onclick}>
636
682
  <div class="count">
637
683
  #{idx}
638
684
  </div>
639
685
  <h1 class="diff">
640
- <a class="external" href="#{urlr}">#{ti}</a>#{ext}
686
+ <a class="external" href="#{urlr}">#{format_text(url, ti)}</a>#{ext}
641
687
  </h1>
688
+ <div id="#{bid}_body">
642
689
  <div class="annotation">
643
690
  #{note && CGI::escapeHTML(note)}
644
691
  </div>
@@ -646,6 +693,7 @@ class Websitary::Configuration
646
693
  #{format(url, text)}
647
694
  </div>
648
695
  </div>
696
+ </div>
649
697
  HTML
650
698
  end.join(('<hr class="separator"/>') + "\n")
651
699
 
@@ -795,7 +843,8 @@ HTML
795
843
  # already included.
796
844
  def push_hrefs(url, hpricot, &condition)
797
845
  begin
798
- return if robots?(hpricot, 'nofollow')
846
+ $logger.debug "push_refs: #{url}"
847
+ return if robots?(hpricot, 'nofollow') or is_excluded?(url)
799
848
  depth = get(url, :depth)
800
849
  return if depth and depth <= 0
801
850
  uri0 = URI.parse(url)
@@ -804,8 +853,8 @@ HTML
804
853
  (hpricot / 'a').each do |a|
805
854
  next if a['rel'] == 'nofollow'
806
855
  href = a['href']
807
- next if href.nil? or href == url or href =~ /^\s*javascript:/
808
- uri = URI.parse(href)
856
+ next if href.nil? or href == url or href =~ /^\s*javascript:/ or href =~ /^\s*mailto:/ or is_excluded?(href)
857
+ uri = URI.parse(href)
809
858
  pn = guess_dir(uri.path)
810
859
  href = rewrite_href(href, url, uri0, pn0, true)
811
860
  curl = canonic_url(href)
@@ -838,17 +887,33 @@ HTML
838
887
  uri = URI.parse(url)
839
888
  urd = guess_dir(uri.path)
840
889
  (doc / 'a').each do |a|
841
- href = rewrite_href(a['href'], url, uri, urd, true)
842
- a['href'] = href if href
890
+ href = a['href']
891
+ if is_excluded?(href)
892
+ comment_element(doc, a)
893
+ else
894
+ href = rewrite_href(href, url, uri, urd, true)
895
+ a['href'] = href if href
896
+ end
843
897
  end
844
898
  (doc / 'img').each do |a|
845
- href = rewrite_href(a['src'], url, uri, urd, false)
846
- a['src'] = href if href
899
+ href = a['src']
900
+ if is_excluded?(href)
901
+ comment_element(doc, a)
902
+ else
903
+ href = rewrite_href(href, url, uri, urd, false)
904
+ a['src'] = href if href
905
+ end
847
906
  end
848
907
  doc
849
908
  end
850
909
 
851
910
 
911
+ def comment_element(doc, elt)
912
+ doc.insert_before(elt, '<!-- WEBSITARY: ')
913
+ doc.insert_after(elt, '-->')
914
+ end
915
+
916
+
852
917
  # Try to make href an absolute url.
853
918
  def rewrite_href(href, url, uri=nil, urd=nil, local=false)
854
919
  begin
@@ -961,7 +1026,7 @@ HTML
961
1026
 
962
1027
 
963
1028
  def canonic_filename(filename)
964
- call_cmd(get_optionvalue(:global, :canonic_filename), [filename], filename)
1029
+ call_cmd(get_optionvalue(:global, :canonic_filename), [filename], :default => filename)
965
1030
  end
966
1031
 
967
1032
 
@@ -970,6 +1035,8 @@ HTML
970
1035
  @options = {
971
1036
  :global => {
972
1037
  :download_html => :openuri,
1038
+ :encoding => 'ISO-8859-1',
1039
+ :toggle_body => false,
973
1040
  },
974
1041
  }
975
1042
 
@@ -996,9 +1063,13 @@ HTML
996
1063
  :raw => :new,
997
1064
 
998
1065
  :htmldiff => lambda {|old, new|
999
- oldhtml = File.read(old)
1000
- newhtml = File.read(new)
1001
- difftext = Websitary::Htmldiff.new(:oldtext => oldhtml, :newtext => newhtml).diff
1066
+ url = url_from_filename(new)
1067
+ args = {
1068
+ :oldhtml => File.read(old),
1069
+ :newhtml => File.read(new),
1070
+ :ignore => get(url, :ignore),
1071
+ }
1072
+ difftext = Websitary::Htmldiff.new(args).diff
1002
1073
  difftext
1003
1074
  },
1004
1075
 
@@ -1130,7 +1201,8 @@ HTML
1130
1201
  rss_diff = Websitary::Htmldiff.new(:highlight => 'highlight', :oldtext => olditem.description, :newtext => item.description).process
1131
1202
  rnew << format_rss_item(item, rss_diff)
1132
1203
  else
1133
- if item.enclosure and (curl = item.enclosure.url)
1204
+ enc = item.respond_to?(:enclosure) && item.enclosure
1205
+ if enc and (curl = enc.url)
1134
1206
  url = url_from_filename(new)
1135
1207
  dir = get(url, :rss_enclosure)
1136
1208
  curl = rewrite_href(curl, url, nil, nil, true)
@@ -1229,15 +1301,31 @@ HTML
1229
1301
  }
1230
1302
 
1231
1303
  @options[:page] = {
1232
- :format => lambda do |ti, li, bd|
1304
+ :format => lambda {|ti, li, bd|
1233
1305
  template = <<OUT
1234
1306
  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
1235
1307
  <html>
1236
1308
  <head>
1237
1309
  <title>%s</title>
1310
+ <meta http-equiv="Content-Type" content="text/html; charset=#{get_optionvalue(:global, :encoding)}">
1238
1311
  <link rel="stylesheet" href="websitary.css" type="text/css">
1239
1312
  <link rel="alternate" href="websitary.rss" type="application/rss+xml" title="%s">
1240
1313
  </head>
1314
+ <script type="text/javascript">
1315
+ function ToggleBody(Item) {
1316
+ var Body = document.getElementById(Item.id + "_body");
1317
+ if (Body.style.visibility == "collapse") {
1318
+ Body.style.visibility = "visible";
1319
+ Body.style.height = "";
1320
+ Item.style.background = "";
1321
+ } else {
1322
+ Body.style.visibility = "collapse";
1323
+ Body.style.height = "1px";
1324
+ Item.style.background = "#e0f0f0";
1325
+ }
1326
+ return '';
1327
+ }
1328
+ </script>
1241
1329
  <body>
1242
1330
  <ol class="toc">
1243
1331
  %s
@@ -1249,7 +1337,96 @@ HTML
1249
1337
  </html>
1250
1338
  OUT
1251
1339
  template % [ti, ti, li, bd]
1252
- end
1340
+ },
1341
+ :css => <<CSS,
1342
+ body {
1343
+ color: black;
1344
+ background-color: #f0f0f0;
1345
+ }
1346
+ a.external {
1347
+ }
1348
+ a.old {
1349
+ }
1350
+ a.latest {
1351
+ }
1352
+ a.toc {
1353
+ }
1354
+ ol.toc {
1355
+ float: left;
1356
+ width: 200px;
1357
+ position: fixed;
1358
+ padding: 0;
1359
+ margin: 0;
1360
+ }
1361
+ li.toc {
1362
+ list-style: none;
1363
+ border: 1px solid #e0e0e0;
1364
+ background-color: #fafafa;
1365
+ padding: 0.1em;
1366
+ font-size: 80%;
1367
+ font-family: Verdana, Myriad Web, Syntax, sans-serif;
1368
+ }
1369
+ li.toc:hover {
1370
+ background-color: #ffff8d;
1371
+ }
1372
+ div.contents {
1373
+ margin-left: 210px;
1374
+ min-width: 16em;
1375
+ }
1376
+ div.webpage {
1377
+ margin: 5px 0 5px 0;
1378
+ padding: 5px;
1379
+ border: 1px solid #e0e0e0;
1380
+ background-color: white;
1381
+ }
1382
+ div.count {
1383
+ text-align: right;
1384
+ }
1385
+ .enclosure {
1386
+ padding: 4px;
1387
+ margin: 4px 0 4px 0;
1388
+ background: #f9f9f9;
1389
+ }
1390
+ h1.diff {
1391
+ font-family: Verdana, Myriad Web, Syntax, sans-serif;
1392
+ }
1393
+ h2.rss {
1394
+ border-top: 10px solid #f0f0f0;
1395
+ padding-top: 10px;
1396
+ }
1397
+ div.diff {
1398
+ padding-left: 2em;
1399
+ }
1400
+ pre.diff {
1401
+ padding-left: 2em;
1402
+ }
1403
+ div.annotation {
1404
+ font-size: 80%;
1405
+ }
1406
+ hr.separator {
1407
+ width: 100%;
1408
+ visibility: hidden;
1409
+ }
1410
+ .error {
1411
+ color: yellow;
1412
+ background-color: red;
1413
+ }
1414
+ .highlight {
1415
+ background-color: #fac751;
1416
+ }
1417
+ .highlight-yellow {
1418
+ background-color: #ffc730;
1419
+ }
1420
+ .highlight-red {
1421
+ background-color: red;
1422
+ }
1423
+ .highlight-blue {
1424
+ background-color: blue;
1425
+ }
1426
+ .highlight-aqua {
1427
+ background-color: aqua;
1428
+ }
1429
+ CSS
1253
1430
  }
1254
1431
  end
1255
1432
 
@@ -1293,7 +1470,7 @@ OUT
1293
1470
 
1294
1471
 
1295
1472
  def get_website(download, url)
1296
- html = call_cmd(get_optionvalue(:download, download), [url])
1473
+ html = call_cmd(get_optionvalue(:download, download), [url], :url => url)
1297
1474
  if html
1298
1475
  doc = Hpricot(html)
1299
1476
  if doc
@@ -1310,7 +1487,7 @@ OUT
1310
1487
 
1311
1488
  def get_website_below(download, url)
1312
1489
  dwnl = get_optionvalue(:download, download)
1313
- html = call_cmd(dwnl, [url])
1490
+ html = call_cmd(dwnl, [url], :url => url)
1314
1491
  if html
1315
1492
  doc = Hpricot(html)
1316
1493
  if doc
@@ -1373,7 +1550,7 @@ OUT
1373
1550
  def read_url(url, type='html')
1374
1551
  downloader = get(url, "download_#{type}".intern)
1375
1552
  if downloader
1376
- call_cmd(downloader, [url])
1553
+ call_cmd(downloader, [url], :url => url)
1377
1554
  else
1378
1555
  read_url_openuri(url)
1379
1556
  end
@@ -1421,10 +1598,12 @@ OUT
1421
1598
 
1422
1599
 
1423
1600
  def format_rss_item(item, body, enclosure='')
1424
- hd = [item.title]
1425
- hd << " (#{item.author})" if item.author
1601
+ ti = rss_field(item, :title)
1602
+ au = rss_field(item, :author)
1603
+ hd = [ti]
1604
+ hd << " (#{au})" if au
1426
1605
  return <<EOT
1427
- <h2 class="rss"><a class="rss" href="#{item.link}">#{hd.join} -- #{item.pubDate}</a></h2>
1606
+ <h2 class="rss"><a class="rss" href="#{rss_field(item, :link)}">#{hd.join} -- #{rss_field(item, :pubDate)}</a></h2>
1428
1607
  <div class="rss">
1429
1608
  #{body}
1430
1609
  #{enclosure}
@@ -1432,6 +1611,16 @@ OUT
1432
1611
  EOT
1433
1612
  end
1434
1613
 
1614
+
1615
+ def rss_field(item, field, default=nil)
1616
+ if item.respond_to?(field)
1617
+ return item.send(field)
1618
+ else
1619
+ return default
1620
+ end
1621
+ end
1622
+
1623
+
1435
1624
  # Guess whether text is plain text or html.
1436
1625
  def is_html?(text)
1437
1626
  text =~ /<(div|a|span|body|html|script|p|table|td|tr|th|li|dt|br|hr|em|b)\b/
@@ -1524,7 +1713,7 @@ EOT
1524
1713
  def file_url(filename)
1525
1714
  # filename = File.join(File.basename(File.dirname(filename)), File.basename(filename))
1526
1715
  # "file://#{encode(filename, ':/')}"
1527
- filename = call_cmd(get_optionvalue(:global, :file_url), [filename], filename)
1716
+ filename = call_cmd(get_optionvalue(:global, :file_url), [filename], :default => filename)
1528
1717
  encode(filename, ':/')
1529
1718
  end
1530
1719
 
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # htmldiff.rb
3
- # @Last Change: 2007-10-08.
3
+ # @Last Change: 2007-11-10.
4
4
  # Author:: Thomas Link (micathom at gmail com)
5
5
  # License:: GPL (see http://www.gnu.org/licenses/gpl.txt)
6
6
  # Created:: 2007-08-17.
@@ -17,7 +17,7 @@ module Websitary
17
17
  # wrong results (especially wrong-negative) in certain occasions.
18
18
  class Htmldiff
19
19
  VERSION = '0.1'
20
- REVISION = '164'
20
+ REVISION = '180'
21
21
 
22
22
  # args:: A hash
23
23
  # Fields:
@@ -30,6 +30,10 @@ module Websitary
30
30
  @high = args[:highlight] || args[:highlightcolor]
31
31
  @old = explode(args[:olddoc] || Hpricot(args[:oldtext] || File.read(args[:oldfile])))
32
32
  @new = args[:newdoc] || Hpricot(args[:newtext] || File.read(args[:newfile]))
33
+ @ignore = args[:ignore]
34
+ if @ignore and !@ignore.kind_of?(Enumerable)
35
+ die "Ignore must be of kind Enumerable: #{ignore.inspect}"
36
+ end
33
37
  @changed = false
34
38
  end
35
39
 
@@ -46,11 +50,11 @@ module Websitary
46
50
  # node, the whole node has changed. If only some sub-nodes have
47
51
  # changed, collect those.
48
52
  def process(node=@new)
49
- acc = []
53
+ acc = []
50
54
  node.each_child do |child|
51
55
  ch = child.to_html.strip
52
56
  next if ch.nil? or ch.empty?
53
- if @old.include?(ch)
57
+ if @old.include?(ch) or ignore(child, ch)
54
58
  if @high
55
59
  acc << child
56
60
  end
@@ -67,6 +71,20 @@ module Websitary
67
71
  end
68
72
 
69
73
 
74
+ def ignore(node, node_as_string)
75
+ return @ignore && @ignore.any? do |i|
76
+ case i
77
+ when Regexp
78
+ node_as_string =~ i
79
+ when Proc
80
+ l.call(node)
81
+ else
82
+ die "Unknown type for ignore expression: #{i.inspect}"
83
+ end
84
+ end
85
+ end
86
+
87
+
70
88
  # Collect all nodes and subnodes in a hpricot document.
71
89
  def explode(node)
72
90
  if node.respond_to?(:each_child)
metadata CHANGED
@@ -1,33 +1,45 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.4
3
- specification_version: 1
4
2
  name: websitary
5
3
  version: !ruby/object:Gem::Version
6
- version: "0.3"
7
- date: 2007-10-26 00:00:00 +02:00
8
- summary: A unified website news, rss feed, podcast monitor
9
- require_paths:
10
- - lib
11
- email: micathom at gmail com
12
- homepage: http://rubyforge.org/projects/websitiary/
13
- rubyforge_project: websitiary
14
- description: "== DESCRIPTION: websitary (formerly known as websitiary with an extra \"i\") monitors webpages, rss feeds, podcasts etc. It reuses other programs (w3m, diff etc.) to do most of the actual work. By default, it works on an ASCII basis, i.e. with the output of text-based webbrowsers like w3m (or lynx, links etc.) as the output can easily be post-processed. It can also work with HTML and highlight new items. This script was originally planned as a ruby-based websec replacement. By default, this script will use w3m to dump HTML pages and then run diff over the current page and the previous backup. Some pages are better viewed with lynx or links. Downloaded documents (HTML or ASCII) can be post-processed (e.g., filtered through some ruby block that extracts elements via hpricot and the like). Please see the configuration options below to find out how to change this globally or for a single source. This user manual is also available as PDF[http://websitiary.rubyforge.org/websitary.pdf]. == FEATURES/PROBLEMS: * Handle webpages, rss feeds (optionally save attachments in podcasts etc.) * Compare webpages with previous backups * Display differences between the current version and the backup * Provide hooks to post-process the downloaded documents and the diff * Display a one-page report summarizing all news * Automatically open the report in your favourite web-browser * Experimental: Download webpages on defined intervalls and generate incremental diffs."
15
- autorequire:
16
- default_executable:
17
- bindir: bin
18
- has_rdoc: true
19
- required_ruby_version: !ruby/object:Gem::Version::Requirement
20
- requirements:
21
- - - ">"
22
- - !ruby/object:Gem::Version
23
- version: 0.0.0
24
- version:
4
+ version: "0.4"
25
5
  platform: ruby
26
- signing_key:
27
- cert_chain:
28
- post_install_message:
29
6
  authors:
30
7
  - Thomas Link
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-01-13 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hpricot
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: "0"
23
+ version:
24
+ - !ruby/object:Gem::Dependency
25
+ name: hoe
26
+ version_requirement:
27
+ version_requirements: !ruby/object:Gem::Requirement
28
+ requirements:
29
+ - - ">="
30
+ - !ruby/object:Gem::Version
31
+ version: 1.4.0
32
+ version:
33
+ description: "== DESCRIPTION: websitary (formerly known as websitiary with an extra \"i\") monitors webpages, rss feeds, podcasts etc. It reuses other programs (w3m, diff etc.) to do most of the actual work. By default, it works on an ASCII basis, i.e. with the output of text-based webbrowsers like w3m (or lynx, links etc.) as the output can easily be post-processed. It can also work with HTML and highlight new items. This script was originally planned as a ruby-based websec replacement. By default, this script will use w3m to dump HTML pages and then run diff over the current page and the previous backup. Some pages are better viewed with lynx or links. Downloaded documents (HTML or ASCII) can be post-processed (e.g., filtered through some ruby block that extracts elements via hpricot and the like). Please see the configuration options below to find out how to change this globally or for a single source. This user manual is also available as PDF[http://websitiary.rubyforge.org/websitary.pdf]. == FEATURES/PROBLEMS: * Handle webpages, rss feeds (optionally save attachments in podcasts etc.) * Compare webpages with previous backups * Display differences between the current version and the backup * Provide hooks to post-process the downloaded documents and the diff * Display a one-page report summarizing all news * Automatically open the report in your favourite web-browser * Experimental: Download webpages on defined intervalls and generate incremental diffs."
34
+ email: micathom at gmail com
35
+ executables:
36
+ - websitary
37
+ extensions: []
38
+
39
+ extra_rdoc_files:
40
+ - History.txt
41
+ - Manifest.txt
42
+ - README.txt
31
43
  files:
32
44
  - History.txt
33
45
  - Manifest.txt
@@ -40,37 +52,32 @@ files:
40
52
  - lib/websitary/configuration.rb
41
53
  - lib/websitary/filemtimes.rb
42
54
  - lib/websitary/htmldiff.rb
43
- test_files: []
44
-
55
+ has_rdoc: true
56
+ homepage: http://rubyforge.org/projects/websitiary/
57
+ post_install_message:
45
58
  rdoc_options:
46
59
  - --main
47
60
  - README.txt
48
- extra_rdoc_files:
49
- - History.txt
50
- - Manifest.txt
51
- - README.txt
52
- executables:
53
- - websitary
54
- extensions: []
55
-
61
+ require_paths:
62
+ - lib
63
+ required_ruby_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ version:
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: "0"
74
+ version:
56
75
  requirements: []
57
76
 
58
- dependencies:
59
- - !ruby/object:Gem::Dependency
60
- name: hpricot
61
- version_requirement:
62
- version_requirements: !ruby/object:Gem::Version::Requirement
63
- requirements:
64
- - - ">"
65
- - !ruby/object:Gem::Version
66
- version: 0.0.0
67
- version:
68
- - !ruby/object:Gem::Dependency
69
- name: hoe
70
- version_requirement:
71
- version_requirements: !ruby/object:Gem::Version::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: 1.3.0
76
- version:
77
+ rubyforge_project: websitiary
78
+ rubygems_version: 1.0.1
79
+ signing_key:
80
+ specification_version: 2
81
+ summary: A unified website news, rss feed, podcast monitor
82
+ test_files: []
83
+