bwkfanboy 0.1.3 → 1.1.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -11,6 +11,10 @@ general assistance.
11
11
  than the whole gem on rubygems.org, so grab the source before
12
12
  struggling).
13
13
 
14
+ Plugins from version 1.1.4 are *incompatible* with the 0.1.x
15
+ series. Please reread in doc/plugin.rdoc the example of the skeleton
16
+ plugin.
17
+
14
18
  = Architecture
15
19
 
16
20
  == Plugins
@@ -18,10 +22,10 @@ struggling).
18
22
  bwkfanboy comes with several plugins. One of them, for example, parses a
19
23
  search page of dailyprincetonian.com looking for bwk's articles.
20
24
 
21
- The plugin is a Ruby class +Page+ that inherits Bwkfanboy::Parse
22
- parent, overriding 1 method.
25
+ The plugin is a Ruby class +Page+ that inherits Bwkfanboy::Parse parent,
26
+ overriding 1 method.
23
27
 
24
- The plugins can be in the system
28
+ Plugins can be in system
25
29
 
26
30
  `gem env gemdir`/gems/bwkfanboy-x.y.z/lib/bwkfanboy/plugins
27
31
 
@@ -89,7 +93,7 @@ There are 2 method to get an Atom feed via HTTP:
89
93
 
90
94
  2. Small *bwkfanboy_server* HTTP server. It can run from any user and
91
95
  thus is able to inherit env variables for discovering your HOME
92
- directory. Read bin/bwkfanboy_server to know how to operate it.
96
+ directory. Read doc/bwkfanboy_server.rdoc to know how to operate it.
93
97
 
94
98
  = License
95
99
  :include: doc/LICENSE
data/Rakefile CHANGED
@@ -9,12 +9,12 @@ require 'rake/testtask'
9
9
  spec = Gem::Specification.new() {|i|
10
10
  i.name = "bwkfanboy"
11
11
  i.summary = 'A converter from HTML to Atom feed that you can use to watch sites that do not provide its own feed.'
12
- i.version = '0.1.3'
12
+ i.version = `bin/#{i.name} -V`
13
13
  i.author = 'Alexander Gromnitsky'
14
14
  i.email = 'alexander.gromnitsky@gmail.com'
15
- i.homepage = 'http://github.com/gromnitsky/bwkfanboy'
15
+ i.homepage = "http://github.com/gromnitsky/#{i.name}"
16
16
  i.platform = Gem::Platform::RUBY
17
- i.required_ruby_version = '>= 1.9'
17
+ i.required_ruby_version = '>= 1.9.2'
18
18
  i.files = FileList['lib/**/*', 'bin/*', 'doc/*', '[A-Z]*', 'test/**/*']
19
19
 
20
20
  i.executables = FileList['bin/*'].gsub(/^bin\//, '')
@@ -22,10 +22,10 @@ spec = Gem::Specification.new() {|i|
22
22
 
23
23
  i.test_files = FileList['test/test_*.rb']
24
24
 
25
- i.rdoc_options << '-m' << 'Bwkfanboy' << '-x' << 'plugins'
26
- i.extra_rdoc_files = FileList['bin/*', 'doc/*']
25
+ i.rdoc_options << '-m' << 'doc/README.rdoc' << '-x' << 'plugins'
26
+ i.extra_rdoc_files = FileList['doc/*']
27
27
 
28
- i.add_dependency('activesupport', '>= 3.0.0')
28
+ i.add_dependency('activesupport', '>= 3.0.1')
29
29
  i.add_dependency('nokogiri', '>= 1.4.3')
30
30
  i.add_dependency('open4', '>= 1.0.1')
31
31
  i.add_dependency('jsonschema', '>= 2.0.0')
@@ -36,12 +36,11 @@ Rake::GemPackageTask.new(spec).define()
36
36
  task(default: %(repackage))
37
37
 
38
38
  Rake::RDocTask.new('doc') {|i|
39
- i.main = "Bwkfanboy"
40
- i.rdoc_files = FileList['doc/*', 'lib/**/*.rb', 'bin/*']
41
- i.rdoc_files.exclude("lib/**/plugins", "test")
39
+ i.main = 'doc/README.rdoc'
40
+ i.rdoc_files = FileList['doc/*', 'lib/**/*.rb']
41
+ i.rdoc_files.exclude("lib/**/plugins")
42
42
  }
43
43
 
44
44
  Rake::TestTask.new() {|i|
45
45
  i.test_files = FileList['test/test_*.rb']
46
- i.verbose = true
47
46
  }
data/bin/bwkfanboy CHANGED
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env ruby19
1
+ #!/usr/bin/env ruby
2
2
  # -*-ruby-*-
3
3
 
4
4
  # This program is executed by bin/bwkfanboy_server to do all dirty work:
@@ -11,13 +11,14 @@
11
11
  #
12
12
  # % bwkfanboy -h
13
13
  #
14
- # to get some basic help & read about Bwkfanboy module.
14
+ # to get some basic help--read about Bwkfanboy module.
15
15
 
16
16
  require 'shellwords'
17
17
  require_relative '../lib/bwkfanboy/parser'
18
18
 
19
19
  $conf = {
20
- mode: 'pipe',
20
+ mode: 'fast',
21
+ debug: false,
21
22
  banner: "Usage: #{File.basename($0)} [options] plugin-name"
22
23
  }
23
24
 
@@ -87,7 +88,8 @@ o = Bwkfanboy::Utils.cl_parse(ARGV, $conf[:banner]) # create OptionParser object
87
88
  o.on('-i', 'Show some info about the plugin') { |i| $conf[:mode] = 'info' }
88
89
  o.on('-l', 'List all plugins') { |i| $conf[:mode] = 'list' }
89
90
  o.on('-p', 'List all plugins paths') { |i| $conf[:mode] = 'path' }
90
- o.on('-D', '(ignore this) Use URI_DEBUG const instead URI in plugins') { |i| $conf[:mode] = 'debug' }
91
+ o.on('-O', '(ignore this) Execute all bwkfanboy_* utils in a pipe') { |i| $conf[:mode] = 'pipe' }
92
+ o.on('-D', '(ignore this) Use URI_DEBUG const instead URI in plugins') { |i| $conf[:debug] = true }
91
93
  Bwkfanboy::Utils.cl_parse(ARGV, $conf[:banner], o) # run cl parser
92
94
 
93
95
  plugin = Plugin.new(ARGV[0])
@@ -107,12 +109,11 @@ when 'info'
107
109
  plugin.load(opt).dump_info
108
110
  when 'version'
109
111
  puts Bwkfanboy::Meta::VERSION
110
- else
111
- # A pipe mode
112
+ when 'pipe'
112
113
  pn = plugin.load(opt)
113
114
  cmd = "./bwkfanboy_fetch | ./bwkfanboy_parse '#{plugin.path}' #{opt.size != 0 ? Shellwords.join(opt) : ''} | ./bwkfanboy_generate"
114
115
  if Bwkfanboy::Utils.cfg[:verbose] >= 2 then
115
- puts ($conf[:mode] != 'debug' ? pn.uri() : pn.class::Meta::URI_DEBUG)
116
+ puts (!$conf[:debug] ? pn.uri() : pn.class::Meta::URI_DEBUG)
116
117
  puts cmd
117
118
  exit 0
118
119
  end
@@ -121,10 +122,31 @@ else
121
122
  Dir.chdir(File.dirname(File.expand_path($0)))
122
123
 
123
124
  pipe = IO.popen(cmd, 'w+')
124
- pipe.puts ($conf[:mode] != 'debug' ? pn.uri : pn.class::Meta::URI_DEBUG)
125
+ pipe.puts (!$conf[:debug] ? pn.uri : pn.class::Meta::URI_DEBUG)
125
126
  pipe.close_write
126
127
  while line = pipe.gets
127
128
  puts line
128
129
  end
129
130
  pipe.close
131
+
132
+ else
133
+ # a 'new', faster way
134
+ pn = plugin.load(opt)
135
+
136
+ require_relative '../lib/bwkfanboy/fetch'
137
+ require_relative '../lib/bwkfanboy/generate'
138
+
139
+ # 1. fetch & parse
140
+ pn = Page.new(opt)
141
+ Bwkfanboy::Fetch.cat(!$conf[:debug] ? pn.uri() : pn.class::Meta::URI_DEBUG) {|stream|
142
+ pn.parse(stream)
143
+ }
144
+
145
+ if Bwkfanboy::Utils.cfg[:verbose] >= 2 then
146
+ pn.dump()
147
+ exit 0
148
+ end
149
+
150
+ # 2.generate
151
+ puts Bwkfanboy::Generate.atom(JSON.parse(pn.to_json))
130
152
  end
data/bin/bwkfanboy_fetch CHANGED
@@ -1,30 +1,13 @@
1
- #!/usr/bin/env ruby19
1
+ #!/usr/bin/env ruby
2
2
  # -*-ruby-*-
3
3
 
4
- # Read stdin for a URI or a full path to the local file, download it (or
5
- # read for the local file) and print to stdout.
6
-
7
- require 'open-uri'
8
-
9
- require_relative '../lib/bwkfanboy/utils'
4
+ require_relative '../lib/bwkfanboy/fetch'
10
5
 
11
6
  $conf = { banner: "Usage: #{File.basename($0)} [options] < uri" }
12
7
 
13
8
  Bwkfanboy::Utils.cl_parse(ARGV, $conf[:banner], nil, true)
14
9
 
15
- uri = gets.chomp()
16
-
17
- Bwkfanboy::Utils.veputs(1, "fetching #{uri}\n")
18
-
19
- begin
20
- open(uri, "User-Agent" => Bwkfanboy::Meta::USER_AGENT) {|f|
21
- if defined?(f.meta) && f.status[0] != '200' then
22
- Bwkfanboy::Utils.errx(1, "cannot fetch #{uri} : HTTP responce: #{f.status[0]}")
23
- end
24
- Bwkfanboy::Utils.veputs(1, "charset = #{f.content_type_parse[1][1]}\n") if defined?(f.meta)
25
- f.each_line {|i| puts i}
26
- }
27
- rescue
28
- # typically Errno::ENOENT
29
- Bwkfanboy::Utils.errx(1, "cannot fetch: #{$!}");
30
- end
10
+ uri = gets
11
+ Bwkfanboy::Fetch.cat(uri) {|f|
12
+ f.each_line {|line| puts line }
13
+ }
@@ -1,24 +1,16 @@
1
- #!/usr/bin/env ruby19
1
+ #!/usr/bin/env ruby
2
2
  # -*-ruby-*-
3
3
 
4
- # Read stdin for JSON, generate from it an Atom feed and print the
5
- # result to stdout in UTF-8.
6
- #
7
- # One can validate the JSON by providing '--check' command line option
8
- # (by default the validating is off).
9
-
10
- require 'rss/maker'
11
- require 'date'
12
- require 'json'
13
- require 'jsonschema'
14
-
15
- require_relative '../lib/bwkfanboy/utils'
4
+ require_relative '../lib/bwkfanboy/generate'
16
5
 
17
6
  $conf = {
18
7
  banner: "Usage: #{File.basename($0)} [options] < json",
19
8
  check: false
20
9
  }
21
10
 
11
+ # we are expecting the input ONLY in UTF-8
12
+ Encoding.default_external = 'UTF-8'
13
+
22
14
  o = Bwkfanboy::Utils.cl_parse(ARGV, $conf[:banner])
23
15
  o.on('--check', 'Validate the input (slow!)') { |i| $conf[:check] = true }
24
16
  Bwkfanboy::Utils.cl_parse(ARGV, $conf[:banner], o) # run cl parser
@@ -29,52 +21,5 @@ rescue
29
21
  Bwkfanboy::Utils.errx(1, "stdin had invalid JSON");
30
22
  end
31
23
 
32
- # validate the input
33
- schema = Bwkfanboy::Utils.gem_dir_system() + '/schema.js'
34
- if $conf[:check] then
35
- begin
36
- JSON::Schema.validate(j, JSON.parse(File.read(schema)))
37
- rescue
38
- Bwkfanboy::Utils.errx(1, "JSON validation with schema (#{schema}) failed");
39
- end
40
- end
41
-
42
- feed = RSS::Maker.make("atom") { |maker|
43
- maker.channel.id = j['channel']['id']
44
- maker.channel.updated = j['channel']['updated']
45
- maker.channel.author = j['channel']['author']
46
- maker.channel.title = j['channel']['title']
47
-
48
- maker.channel.links.new_link {|i|
49
- i.href = j['channel']['link']
50
- i.rel = 'alternate'
51
- i.type = 'text/html' # eh
52
- }
53
-
54
- maker.items.do_sort = true
55
-
56
- j['x_entries'].each { |i|
57
- maker.items.new_item do |item|
58
- item.links.new_link {|k|
59
- k.href = i['link']
60
- k.rel = 'alternate'
61
- k.type = 'text/html' # only to make happy crappy pr2nntp gateway
62
- }
63
- item.title = i['title']
64
- item.author = i['author']
65
- item.updated = i['updated']
66
- item.content.type = j['channel']['x_entries_content_type']
67
-
68
- case item.content.type
69
- when 'text'
70
- item.content.content = i['content']
71
- when 'html'
72
- item.content.content = i['content']
73
- else
74
- item.content.xhtml = i['content']
75
- end
76
- end
77
- }
78
- }
79
-
80
- puts feed
24
+ if $conf[:check] then Bwkfanboy::Generate.validate(j) end
25
+ puts Bwkfanboy::Generate.atom(j)
data/bin/bwkfanboy_parse CHANGED
@@ -1,12 +1,6 @@
1
- #!/usr/bin/env ruby19
1
+ #!/usr/bin/env ruby
2
2
  # -*-ruby-*-
3
3
 
4
- # Take 1 command line parameter: a full path to a plugin.
5
- #
6
- # Read stdin for a HTML, parse it and print the result to stdout in JSON
7
- # format. If '-vv' command line parameters were given, output will be in
8
- # 'key: value' pairs and <em>not</em> in JSON.
9
-
10
4
  require_relative '../lib/bwkfanboy/parser'
11
5
 
12
6
  $conf = {
@@ -19,12 +13,12 @@ if ARGV.size == 0 then
19
13
  abort($conf[:banner])
20
14
  else
21
15
  Bwkfanboy::Utils.plugin_load(ARGV[0], Bwkfanboy::Meta::PLUGIN_CLASS)
22
- end;
16
+ end
23
17
 
24
18
  opt = Bwkfanboy::Utils.plugin_opts(ARGV)
25
19
  pn = Page.new(opt)
26
20
  pn.check()
27
- pn.parse()
21
+ pn.parse(STDIN)
28
22
 
29
23
  if Bwkfanboy::Utils.cfg[:verbose] >= 2 then
30
24
  pn.dump()
data/bin/bwkfanboy_server CHANGED
@@ -1,42 +1,9 @@
1
- #!/usr/bin/env ruby19
1
+ #!/usr/bin/env ruby
2
2
  # -*-ruby-*-
3
3
 
4
- # Start a HTTP server (by default on 127.0.0.1:9042). To get Atom feeds
5
- # from it, initiate GET request with URI
6
- #
7
- # http://localhost:9042/?p=PLUGIN
8
- #
9
- # where +PLUGIN+ is a name of a bwkfanboy's plugin (without '.re' suffix).
10
- # If the plugin requires additional options you can specify them like:
11
- #
12
- # http://localhost:9042/?p=PLUGIN&o=opt1%20%22opt2%20has%20spaces%22
13
- #
14
- # where <tt>opt1%20%22opt2%20has%20spaces%22</tt> is a encoded string
15
- # <tt>opt1 "opt2 has spaces"</tt>.
16
- #
17
- # To list all available plugins, point you browser to
18
- #
19
- # http://localhost:9042/list
20
- #
21
- # The server is intended to run from a non-root user from
22
- # <tt>~/.login</tt> file. It can detach from a terminal if you give it
23
- # '-d' command line option.
24
- #
25
- # For other help, type:
26
- #
27
- # bwkfanboy_server -h
28
- #
29
- # The server maintains 2 logs:
30
- #
31
- # /tmp/bwkfanboy/USER/log/bwkfanboy_server.log
32
- # /tmp/bwkfanboy/USER/log/bwkfanboy_server-access.log
33
- #
34
- # The file with a pid:
35
- #
36
- # /tmp/bwkfanboy/USER/bwkfanboy_server.pid
37
-
38
4
  require 'shellwords'
39
5
  require 'webrick'
6
+ require 'date'
40
7
  require_relative '../lib/bwkfanboy/utils'
41
8
 
42
9
  $conf = {
@@ -78,8 +45,14 @@ class FeedServlet < WEBrick::HTTPServlet::AbstractServlet # :nodoc: all
78
45
  if r[0] != 0 then
79
46
  raise WEBrick::HTTPStatus::InternalServerError.new("Errors in the pipeline:\n\n #{r[1]}")
80
47
  end
81
-
48
+
82
49
  res.body = r[2]
50
+
51
+ # search for <updated> tag and set Last-Modified header
52
+ if (m = r[2].match('<updated>(.+?)</updated>'))
53
+ res['Last-Modified'] = DateTime.parse(m.to_s).httpdate
54
+ end
55
+
83
56
  else
84
57
  raise WEBrick::HTTPStatus::InternalServerError.new("Parameter 'p' required")
85
58
  end
data/doc/NEWS.rdoc CHANGED
@@ -1,6 +1,20 @@
1
- === Current
1
+ === 1.1.4
2
2
 
3
- - See git log.
3
+ - INCOMPATIBILITY: from now on, all plugins must NOT read the stdin
4
+ but the stream provided by bwkfanboy. See doc/plugin.rdoc.
5
+
6
+ - Moved the code from bwkfanboy_* to libraries.
7
+
8
+ - From now on, bwkfanboy util by default does not run utils in a pipe but
9
+ uses libraries directly. One can restore the old functionality with
10
+ '-O' CL switch.
11
+
12
+ === 0.1.4
13
+
14
+ - The minimum required Ruby version is 1.9.2.
15
+ - bwkfanboy_server now inserts a Last-Modified header.
16
+ - Fixed (?) a bug in bwkfanboy_generate with external encoding.
17
+ - Updated tests for Ruby 1.9.2.
4
18
 
5
19
  === 0.1.3
6
20
 
data/doc/README.rdoc CHANGED
@@ -11,6 +11,10 @@ general assistance.
11
11
  than the whole gem on rubygems.org, so grab the source before
12
12
  struggling).
13
13
 
14
+ Plugins from version 1.1.4 are *incompatible* with the 0.1.x
15
+ series. Please reread in doc/plugin.rdoc the example of the skeleton
16
+ plugin.
17
+
14
18
  = Architecture
15
19
 
16
20
  == Plugins
@@ -18,10 +22,10 @@ struggling).
18
22
  bwkfanboy comes with several plugins. One of them, for example, parses a
19
23
  search page of dailyprincetonian.com looking for bwk's articles.
20
24
 
21
- The plugin is a Ruby class +Page+ that inherits Bwkfanboy::Parse
22
- parent, overriding 1 method.
25
+ The plugin is a Ruby class +Page+ that inherits Bwkfanboy::Parse parent,
26
+ overriding 1 method.
23
27
 
24
- The plugins can be in the system
28
+ Plugins can be in system
25
29
 
26
30
  `gem env gemdir`/gems/bwkfanboy-x.y.z/lib/bwkfanboy/plugins
27
31
 
@@ -89,7 +93,7 @@ There are 2 method to get an Atom feed via HTTP:
89
93
 
90
94
  2. Small *bwkfanboy_server* HTTP server. It can run from any user and
91
95
  thus is able to inherit env variables for discovering your HOME
92
- directory. Read bin/bwkfanboy_server to know how to operate it.
96
+ directory. Read doc/bwkfanboy_server.rdoc to know how to operate it.
93
97
 
94
98
  = License
95
99
  :include: doc/LICENSE
@@ -0,0 +1,4 @@
1
+ = bwkfanboy_fetch
2
+
3
+ Read stdin for a URI or a full path to the local file, download it (or
4
+ read from the local file) and print the result to stdout.
@@ -0,0 +1,7 @@
1
+ = bwkfanboy_generate
2
+
3
+ Read stdin for JSON, generate from it an Atom feed and print the
4
+ result to stdout in UTF-8.
5
+
6
+ One can validate the JSON by providing '--check' command line option
7
+ (by default the validating is off).
@@ -0,0 +1,7 @@
1
+ = bwkfanboy_parse
2
+
3
+ Takes 1 command line parameter: a full path to a plugin.
4
+
5
+ Reads stdin for a HTML, parses it and prints the result to stdout in
6
+ JSON format. If '-vv' command line parameters were given, output will
7
+ be in 'key: value' pairs and <em>not</em> in JSON.
@@ -0,0 +1,35 @@
1
+ = bwkfanboy_server
2
+
3
+ Start a HTTP server (by default on 127.0.0.1:9042). To get Atom feeds
4
+ from it, initiate GET request with URI
5
+
6
+ http://localhost:9042/?p=PLUGIN
7
+
8
+ where +PLUGIN+ is the name of a bwkfanboy plugin (without the '.rb' suffix).
9
+ If the plugin requires additional options you can specify them like:
10
+
11
+ http://localhost:9042/?p=PLUGIN&o=opt1%20%22opt2%20has%20spaces%22
12
+
13
+ where <tt>opt1%20%22opt2%20has%20spaces%22</tt> is an encoded string
14
+ <tt>opt1 "opt2 has spaces"</tt>.
15
+
16
+ To list all available plugins, point your browser to
17
+
18
+ http://localhost:9042/list
19
+
20
+ The server is intended to run from a non-root user from
21
+ <tt>~/.login</tt> file. It can detach from a terminal if you give it
22
+ '-d' command line option.
23
+
24
+ For other help, type:
25
+
26
+ bwkfanboy_server -h
27
+
28
+ The server maintains 2 logs:
29
+
30
+ /tmp/bwkfanboy/USER/log/bwkfanboy_server.log
31
+ /tmp/bwkfanboy/USER/log/bwkfanboy_server-access.log
32
+
33
+ The file with a pid:
34
+
35
+ /tmp/bwkfanboy/USER/bwkfanboy_server.pid
data/doc/plugin.rdoc CHANGED
@@ -1,12 +1,11 @@
1
- = HOWTO Write a \Plugin
1
+ = How to Write a \Plugin
2
2
 
3
3
  First of all, look at examples provided with bwkfanboy. They were
4
4
  intended to be 100% working because I was writing them for myself.
5
5
 
6
- Basically, all you need is to write a class named _Page_ that
7
- inherits this class Bwkfanboy::Parse, override in the child #myparse
8
- method and write a simple module named _Meta_ inside your _Page_
9
- class.
6
+ Basically, all you need is to write a class named _Page_ that inherits
7
+ class Bwkfanboy::Parse, override in the child \#myparse method and write
8
+ a simple module named _Meta_ inside your _Page_ class.
10
9
 
11
10
  == Skeleton
12
11
 
@@ -24,9 +23,9 @@ Here is a skeleton of a plugin:
24
23
  CONTENT_TYPE = 'html'
25
24
  end
26
25
 
27
- def myparse()
28
- # read stdin and parse it
29
- doc = Nokogiri::HTML(STDIN, nil, Meta::ENC)
26
+ def myparse(stream)
27
+ # read 'stream' IO object and parse it
28
+ doc = Nokogiri::HTML(stream, nil, Meta::ENC)
30
29
  doc.xpath("XPATH QUERY").each {|i|
31
30
  t = clean(i.xpath("XPATH QUERY").text())
32
31
  l = clean(i.xpath("XPATH QUERY").text())
@@ -52,8 +51,8 @@ it should.
52
51
 
53
52
  === \Meta
54
53
 
55
- Module _Meta_ can have only constants--and *all* constants listed in
56
- the skeleton are required.
54
+ Module _Meta_ can only have constants--and *all* constants listed in the
55
+ skeleton are mandatory.
57
56
 
58
57
  * <tt>URI</tt>--can be a <tt>http(s)://</tt> or <tt>ftp://</tt> URL
59
58
  or just a path to a file on your local machine, as
@@ -75,13 +74,17 @@ the skeleton are required.
75
74
 
76
75
  === myparse
77
76
 
78
- In #myparse method please read stdin. The contends of it is the raw
79
- HTML you want to parse. The general idea:
77
+ In \#myparse method read 'stream' IO object. The contents of it is the
78
+ raw HTML you want to parse. The general idea:
80
79
 
81
- * Atom feed must contain at least 1 entry, so look in HTML for some
82
- crap which you break into 5 peaces: title of the entry, link for
83
- it, a date for the entry, who is author of the entry and its
84
- contents.
80
+ * Atom feed must contain at least 1 entry, so look in the HTML for some
81
+ crap which you must break into 5 pieces:
82
+
83
+ - a title of the entry
84
+ - a link for it
85
+ - a date for the entry
86
+ - who is the author of the entry and
87
+ - its contents.
85
88
 
86
89
  * After you scan and grab 1 entry, create a hash and add it to
87
90
  _self_ as it was in the skeleton:
@@ -89,20 +92,20 @@ HTML you want to parse. The general idea:
89
92
  self << { title: t, link: l, updated: u, author: a, content: c }
90
93
 
91
94
  Here variables _t_, _l_, _u_, _a_ and _c_ contains the actual
92
- values of 5 peaces for the entry. Names of the keys in hash are
95
+ values of 5 pieces for the entry. Names of the keys in the hash are
93
96
  important of course--don't invent your own.
94
97
 
95
- * There would be probably more crap in HTML that you can use to
98
+ * Probably there would be more crap in the HTML that you can use to
96
99
  construct another entry. Keep parsing and adding entries.
97
100
 
98
- * While you scanning, use the 2 helper methods for cleaning each
99
- peace: \#clean, which removed duplicate spaces and #date, which
100
- parses a sting and return a date in ISO8601 format. You may
101
- override #date method if you like.
101
+ * While you're scanning, use the 2 helper methods for cleaning each
102
+ piece: \#clean, which removes duplicate spaces, and \#date, which parses
103
+ a string and returns a date in ISO8601 format. You may override \#date
104
+ method if you like.
102
105
 
103
106
  === Options
104
107
 
105
- Plugins can have _options_ and a user should provide then to the plugin
108
+ Plugins can have _options_ and a user should provide them to the plugin
106
109
  in the real-time. For example, say you're scraping a site where many
107
110
  users are wasting their time. If you want to watch for several of them
108
111
  it is silly to write a new plugin every time for a new
@@ -130,20 +133,20 @@ _option_ becomes mandatory for the end-user.
130
133
  To test how nice your plugin works, save the html page to the file
131
134
  and type:
132
135
 
133
- % bwkfanboy_parse -vv path/to/a/plugin.rb < saved_page.html
136
+ % bwkfanboy_parse -vv /path/to/the/plugin.rb < saved_page.html
134
137
 
135
138
  to see the result as in plain text, or
136
139
 
137
- % bwkfanboy_parse -v path/to/a/plugin.rb < saved_page.html
140
+ % bwkfanboy_parse -v /path/to/the/plugin.rb < saved_page.html
138
141
 
139
- as pretty JSON.
142
+ as a pretty JSON.
140
143
 
141
144
  For option-enabled plugins, supply additional parameters for them after
142
145
  the plugin path:
143
146
 
144
- % bwkfanboy_parse -vv path/to/a/plugin.rb \
145
- option_1 "options 2" < saved_page.html
147
+ % bwkfanboy_parse -vv /path/to/the/plugin.rb \
148
+ option_1 "option 2" < saved_page.html
146
149
 
147
150
  <tt>bwkfanboy_parse</tt> return 0 if no errors occurred or >= 1 if you
148
151
  have errors in your plugin code. N.B.: the output from
149
- <tt>bwkparser_parse</tt> is always in UTF-8.
152
+ <tt>bwkfanboy_parse</tt> *must* always be in UTF-8.
@@ -0,0 +1,36 @@
1
+ require 'open-uri'
2
+
3
+ require_relative 'utils'
4
+
5
+ module Bwkfanboy
6
+ class Fetch
7
+
8
+ # If no block given, return contents of fetch'ed URI. Otherwise,
9
+ # execute the block with 1 parameter--stream.
10
+ def self.cat(uri)
11
+ uri.chomp!
12
+
13
+ Bwkfanboy::Utils.veputs(1, "fetching #{uri}\n")
14
+
15
+ begin
16
+ open(uri, "User-Agent" => Bwkfanboy::Meta::USER_AGENT) {|f|
17
+ if defined?(f.meta) && f.status[0] != '200' then
18
+ Bwkfanboy::Utils.errx(1, "cannot fetch #{uri} : HTTP responce: #{f.status[0]}")
19
+ end
20
+ Bwkfanboy::Utils.veputs(1, "charset = #{f.content_type_parse[1][1]}\n") if defined?(f.meta)
21
+ if block_given?
22
+ yield f
23
+ else
24
+ return f.read
25
+ end
26
+ }
27
+ rescue
28
+ # typically Errno::ENOENT
29
+ Bwkfanboy::Utils.errx(1, "cannot fetch: #{$!}");
30
+ end
31
+
32
+ return ""
33
+ end
34
+
35
+ end
36
+ end
@@ -0,0 +1,63 @@
1
+ require 'rss/maker'
2
+ require 'date'
3
+ require 'json'
4
+ require 'jsonschema'
5
+
6
+ require_relative 'utils'
7
+
8
+ module Bwkfanboy
9
+ class Generate
10
+
11
+ def self.validate(t)
12
+ schema = Bwkfanboy::Utils.gem_dir_system() + '/schema.js'
13
+ begin
14
+ JSON::Schema.validate(t, JSON.parse(File.read(schema)))
15
+ rescue
16
+ Bwkfanboy::Utils.errx(1, "JSON validation with schema (#{schema}) failed");
17
+ end
18
+ end
19
+
20
+ def self.atom(src)
21
+ feed = RSS::Maker.make("atom") { |maker|
22
+ maker.channel.id = src['channel']['id']
23
+ maker.channel.updated = src['channel']['updated']
24
+ maker.channel.author = src['channel']['author']
25
+ maker.channel.title = src['channel']['title']
26
+
27
+ maker.channel.links.new_link {|i|
28
+ i.href = src['channel']['link']
29
+ i.rel = 'alternate'
30
+ i.type = 'text/html' # eh
31
+ }
32
+
33
+ maker.items.do_sort = true
34
+
35
+ src['x_entries'].each { |i|
36
+ maker.items.new_item do |item|
37
+ item.links.new_link {|k|
38
+ k.href = i['link']
39
+ k.rel = 'alternate'
40
+ k.type = 'text/html' # only to make happy crappy pr2nntp gateway
41
+ }
42
+ item.title = i['title']
43
+ item.author = i['author']
44
+ item.updated = i['updated']
45
+ item.content.type = src['channel']['x_entries_content_type']
46
+
47
+ case item.content.type
48
+ when 'text'
49
+ item.content.content = i['content']
50
+ when 'html'
51
+ item.content.content = i['content']
52
+ else
53
+ item.content.xhtml = i['content']
54
+ end
55
+ end
56
+ }
57
+ }
58
+
59
+ return feed
60
+ end
61
+
62
+ end
63
+ end
@@ -8,7 +8,7 @@ module Bwkfanboy
8
8
 
9
9
  # :include: ../../doc/plugin.rdoc
10
10
  class Parse
11
- ENTRIES_MAX = 64
11
+ ENTRIES_MAX = 128
12
12
 
13
13
  attr_reader :opt
14
14
 
@@ -18,10 +18,10 @@ module Bwkfanboy
18
18
  end
19
19
 
20
20
  # Invokes #myparse & checks if it has grabbed something.
21
- def parse()
21
+ def parse(stream)
22
22
  @entries = []
23
23
  begin
24
- myparse()
24
+ myparse(stream)
25
25
  rescue
26
26
  @entries = []
27
27
  Utils.errx(1, "parsing failed: #{$!}\n\nBacktrace:\n\n#{$!.backtrace.join("\n")}")
@@ -99,7 +99,7 @@ module Bwkfanboy
99
99
  protected
100
100
 
101
101
  # This *must* be overridden in the child.
102
- def myparse()
102
+ def myparse(stream)
103
103
  raise "plugin isn't finished yet"
104
104
  end
105
105
 
@@ -8,16 +8,16 @@ class Page < Bwkfanboy::Parse
8
8
  URI = 'http://www.dailyprincetonian.com/advanced_search/?author=Brian+Kernighan'
9
9
  URI_DEBUG = '/home/alex/lib/software/alex/bwkfanboy/test/semis/bwk.html'
10
10
  ENC = 'UTF-8'
11
- VERSION = 1
11
+ VERSION = 2
12
12
  COPYRIGHT = "See bwkfanboy's LICENSE file"
13
13
  TITLE = "Brian Kernighan's articles from Daily Princetonian"
14
14
  CONTENT_TYPE = 'html'
15
15
  end
16
16
 
17
- def myparse()
17
+ def myparse(stream)
18
18
  url = "http://www.dailyprincetonian.com"
19
19
 
20
- doc = Nokogiri::HTML(STDIN, nil, Meta::ENC)
20
+ doc = Nokogiri::HTML(stream, nil, Meta::ENC)
21
21
  doc.xpath("//div[@class='article_item']").each {|i|
22
22
  t = clean(i.xpath("h2/a").children.text())
23
23
  fail 'unable to extract link' if (link = clean(i.xpath("h2/a")[0].attributes['href'].value()).empty?)
@@ -5,7 +5,7 @@ class Page < Bwkfanboy::Parse
5
5
  URI = '/usr/ports/UPDATING'
6
6
  URI_DEBUG = URI
7
7
  ENC = 'ASCII'
8
- VERSION = 1
8
+ VERSION = 2
9
9
  COPYRIGHT = "See bwkfanboy's LICENSE file"
10
10
  TITLE = "News from FreeBSD ports"
11
11
  CONTENT_TYPE = 'text'
@@ -24,7 +24,7 @@ class Page < Bwkfanboy::Parse
24
24
  return t
25
25
  end
26
26
 
27
- def myparse()
27
+ def myparse(stream)
28
28
  re_u = /^(\d{8}):$/
29
29
  re_t1 = /^ {2}AFFECTS:\s+(.+)$/
30
30
  re_t2 = /^\s+(.+)$/
@@ -33,7 +33,7 @@ class Page < Bwkfanboy::Parse
33
33
  ready = false
34
34
  mode = nil
35
35
  t = l = u = a = c = nil
36
- while line = STDIN.gets
36
+ while line = stream.gets
37
37
  line.rstrip!
38
38
 
39
39
  if line =~ re_u then
@@ -17,17 +17,17 @@ class Page < Bwkfanboy::Parse
17
17
  URI = 'http://www.quora.com/#{opt[0]}/answers'
18
18
  URI_DEBUG = '/home/alex/lib/software/alex/bwkfanboy/test/semis/quora.html'
19
19
  ENC = 'UTF-8'
20
- VERSION = 4
20
+ VERSION = 5
21
21
  COPYRIGHT = "See bwkfanboy's LICENSE file"
22
22
  TITLE = "Last n answers (per-user) from Quora; requires nodejs"
23
23
  CONTENT_TYPE = 'html'
24
24
  end
25
25
 
26
- def myparse()
26
+ def myparse(stream)
27
27
  profile = opt[0] # for example, 'Brandon-Smietana'
28
28
 
29
29
  # read the 'stream' IO object
30
- doc = Nokogiri::HTML(STDIN, nil, Meta::ENC)
30
+ doc = Nokogiri::HTML(stream, nil, Meta::ENC)
31
31
 
32
32
  # extract & evaluate JavaScript into tstp
33
33
  tstp = nil
@@ -7,7 +7,7 @@ require 'active_support/core_ext/module/attribute_accessors'
7
7
  module Bwkfanboy
8
8
  module Meta
9
9
  NAME = 'bwkfanboy'
10
- VERSION = '0.1.3'
10
+ VERSION = '1.1.4'
11
11
  USER_AGENT = "#{NAME}/#{VERSION} (#{RUBY_PLATFORM}; N; #{Encoding.default_external.name}; #{RUBY_ENGINE}; rv:#{RUBY_VERSION}.#{RUBY_PATCHLEVEL})"
12
12
  PLUGIN_CLASS = 'Page'
13
13
  DIR_TMP = "/tmp/#{Meta::NAME}/#{ENV['USER']}"
@@ -89,7 +89,7 @@ module Bwkfanboy
89
89
  # TODO get rid of eval()
90
90
  fail "class #{class_name} isn't defined" if (! eval("defined?#{class_name}") || ! eval(class_name).is_a?(Class) )
91
91
  rescue LoadError
92
- errx(1, "cannot load plugin '#{path}'");
92
+ errx(1, "cannot load plugin '#{path}' #{$!}");
93
93
  rescue Exception
94
94
  errx(1, "plugin '#{path}' has errors: #{$!}\n\nBacktrace:\n\n#{$!.backtrace.join("\n")}")
95
95
  end
data/test/test_fetch.rb CHANGED
@@ -1,10 +1,7 @@
1
- #!/usr/bin/env ruby19
2
-
3
- require 'minitest/autorun'
4
1
  require 'digest/md5'
5
2
 
6
3
  require_relative '../lib/bwkfanboy/utils'
7
- require_relative 'ts_utils.rb'
4
+ require_relative 'ts_utils'
8
5
 
9
6
  # TODO add HTTP 404 check; drop connection from server during HTTP 200
10
7
  # replay...
@@ -1,6 +1,3 @@
1
- #!/usr/bin/env ruby19
2
-
3
- require 'minitest/autorun'
4
1
  require 'digest/md5'
5
2
 
6
3
  require_relative '../lib/bwkfanboy/utils'
data/test/test_parse.rb CHANGED
@@ -1,6 +1,3 @@
1
- #!/usr/bin/env ruby19
2
-
3
- require 'minitest/autorun'
4
1
  require 'digest/md5'
5
2
 
6
3
  require_relative '../lib/bwkfanboy/utils'
@@ -17,16 +14,16 @@ class TestParse < MiniTest::Unit::TestCase
17
14
 
18
15
  def test_empty_plugin
19
16
  cmd CMD
20
- r = Bwkfanboy::Utils.cmd_run("#{cmd CMD} #{@tpath}plugins/empty.rb ")
17
+ r = Bwkfanboy::Utils.cmd_run("#{cmd CMD} #{Dir.pwd}/#{@tpath}plugins/empty.rb ")
21
18
  assert_equal(1, r[0])
22
19
  assert_match(/plugin .+ has errors: class Page isn't defined/, r[1])
23
20
  end
24
21
 
25
22
  def test_plugin_parse
26
23
  cmd CMD
27
- r = Bwkfanboy::Utils.cmd_run("#{cmd CMD} #{@tpath}plugins/bwk.rb < #{@tpath}semis/bwk.html")
24
+ r = Bwkfanboy::Utils.cmd_run("#{cmd CMD} #{Dir.pwd}/#{@tpath}../lib/bwkfanboy/plugins/bwk.rb < #{@tpath}semis/bwk.html")
28
25
  assert_equal(0, r[0])
29
- # bin/bwkfanboy_parse test/plugins/bwk.rb < test/semis/bwk.html | md5
30
- assert_equal('371fb5a5c5b5519b5eff085df2d31e18', Digest::MD5.hexdigest(r[2]))
26
+ # bin/bwkfanboy_parse `pwd`/lib/bwkfanboy/plugins/bwk.rb < test/semis/bwk.html | md5
27
+ assert_equal('a433a4a27bafb060a41aa85a40808056', Digest::MD5.hexdigest(r[2]))
31
28
  end
32
29
  end
data/test/test_server.rb CHANGED
@@ -1,6 +1,3 @@
1
- #!/usr/bin/env ruby19
2
-
3
- require 'minitest/autorun'
4
1
  require 'open-uri'
5
2
  require 'digest/md5'
6
3
 
data/test/ts_utils.rb CHANGED
@@ -1,3 +1,11 @@
1
+ # don't run test automatically
2
+ # if they were invoked as 'gem check -t ...'
3
+ if $0 =~ /gem/
4
+ require 'minitest/unit'
5
+ else
6
+ require 'minitest/autorun'
7
+ end
8
+
1
9
  # return the right directory for _c_
2
10
  def cmd(c)
3
11
  @tpath = ''
metadata CHANGED
@@ -3,10 +3,10 @@ name: bwkfanboy
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
- - 0
7
6
  - 1
8
- - 3
9
- version: 0.1.3
7
+ - 1
8
+ - 4
9
+ version: 1.1.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Alexander Gromnitsky
@@ -14,27 +14,29 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-10-29 00:00:00 +03:00
17
+ date: 2010-11-08 00:00:00 +02:00
18
18
  default_executable: bwkfanboy
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: activesupport
22
22
  prerelease: false
23
23
  requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
24
25
  requirements:
25
26
  - - ">="
26
27
  - !ruby/object:Gem::Version
27
28
  segments:
28
29
  - 3
29
30
  - 0
30
- - 0
31
- version: 3.0.0
31
+ - 1
32
+ version: 3.0.1
32
33
  type: :runtime
33
34
  version_requirements: *id001
34
35
  - !ruby/object:Gem::Dependency
35
36
  name: nokogiri
36
37
  prerelease: false
37
38
  requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
38
40
  requirements:
39
41
  - - ">="
40
42
  - !ruby/object:Gem::Version
@@ -49,6 +51,7 @@ dependencies:
49
51
  name: open4
50
52
  prerelease: false
51
53
  requirement: &id003 !ruby/object:Gem::Requirement
54
+ none: false
52
55
  requirements:
53
56
  - - ">="
54
57
  - !ruby/object:Gem::Version
@@ -63,6 +66,7 @@ dependencies:
63
66
  name: jsonschema
64
67
  prerelease: false
65
68
  requirement: &id004 !ruby/object:Gem::Requirement
69
+ none: false
66
70
  requirements:
67
71
  - - ">="
68
72
  - !ruby/object:Gem::Version
@@ -84,15 +88,14 @@ executables:
84
88
  extensions: []
85
89
 
86
90
  extra_rdoc_files:
87
- - bin/bwkfanboy_generate
88
- - bin/bwkfanboy_parse
89
- - bin/bwkfanboy
90
- - bin/bwkfanboy_server
91
- - bin/bwkfanboy_fetch
92
- - doc/plugin.rdoc
93
- - doc/README.rdoc
94
91
  - doc/LICENSE
95
92
  - doc/NEWS.rdoc
93
+ - doc/README.rdoc
94
+ - doc/plugin.rdoc
95
+ - doc/bwkfanboy_fetch.rdoc
96
+ - doc/bwkfanboy_generate.rdoc
97
+ - doc/bwkfanboy_parse.rdoc
98
+ - doc/bwkfanboy_server.rdoc
96
99
  files:
97
100
  - lib/bwkfanboy/plugins/bwk.rb
98
101
  - lib/bwkfanboy/plugins/freebsd-ports-update.rb
@@ -101,20 +104,25 @@ files:
101
104
  - lib/bwkfanboy/parser.rb
102
105
  - lib/bwkfanboy/utils.rb
103
106
  - lib/bwkfanboy/schema.js
107
+ - lib/bwkfanboy/fetch.rb
108
+ - lib/bwkfanboy/generate.rb
104
109
  - bin/bwkfanboy_generate
105
110
  - bin/bwkfanboy_parse
106
111
  - bin/bwkfanboy
107
112
  - bin/bwkfanboy_server
108
113
  - bin/bwkfanboy_fetch
109
- - doc/plugin.rdoc
110
- - doc/README.rdoc
111
114
  - doc/LICENSE
112
115
  - doc/NEWS.rdoc
116
+ - doc/README.rdoc
117
+ - doc/plugin.rdoc
118
+ - doc/bwkfanboy_fetch.rdoc
119
+ - doc/bwkfanboy_generate.rdoc
120
+ - doc/bwkfanboy_parse.rdoc
121
+ - doc/bwkfanboy_server.rdoc
113
122
  - README.rdoc
114
123
  - Rakefile
115
124
  - TODO
116
125
  - test/plugins/empty.rb
117
- - test/plugins/bwk.rb
118
126
  - test/semis/bwk.html
119
127
  - test/semis/bwk.json
120
128
  - test/semis/quora.html
@@ -134,20 +142,23 @@ licenses: []
134
142
  post_install_message:
135
143
  rdoc_options:
136
144
  - -m
137
- - Bwkfanboy
145
+ - doc/README.rdoc
138
146
  - -x
139
147
  - plugins
140
148
  require_paths:
141
149
  - lib
142
150
  required_ruby_version: !ruby/object:Gem::Requirement
151
+ none: false
143
152
  requirements:
144
153
  - - ">="
145
154
  - !ruby/object:Gem::Version
146
155
  segments:
147
156
  - 1
148
157
  - 9
149
- version: "1.9"
158
+ - 2
159
+ version: 1.9.2
150
160
  required_rubygems_version: !ruby/object:Gem::Requirement
161
+ none: false
151
162
  requirements:
152
163
  - - ">="
153
164
  - !ruby/object:Gem::Version
@@ -157,7 +168,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
157
168
  requirements: []
158
169
 
159
170
  rubyforge_project:
160
- rubygems_version: 1.3.6
171
+ rubygems_version: 1.3.7
161
172
  signing_key:
162
173
  specification_version: 3
163
174
  summary: A converter from HTML to Atom feed that you can use to watch sites that do not provide its own feed.
data/test/plugins/bwk.rb DELETED
@@ -1,29 +0,0 @@
1
- require 'nokogiri'
2
-
3
- class Page < Bwkfanboy::Parse
4
- module Meta
5
- URI = "html/bwk.html"
6
- ENC = 'UTF-8'
7
- VERSION = 1
8
- COPYRIGHT = '(c) 2010 Alexander Gromnitsky'
9
- TITLE = "Brian Kernighan's articles from Daily Princetonian"
10
- CONTENT_TYPE = 'html'
11
- end
12
-
13
- def myparse()
14
- url = "http://www.dailyprincetonian.com"
15
-
16
- doc = Nokogiri::HTML(STDIN, nil, Meta::ENC)
17
- doc.xpath("//div[@class='article_item']").each {|i|
18
- t = clean(i.xpath("h2/a").children.text())
19
- fail 'unable to extract link' if (link = clean(i.xpath("h2/a")[0].attributes['href'].value()).empty?)
20
- link = clean(i.xpath("h2/a")[0].attributes['href'].value())
21
- l = url + link + "print"
22
- u = date(i.xpath("h2").children[1].text())
23
- a = clean(i.xpath("div/span/a[1]").children.text())
24
- c = clean(i.xpath("div[@class='summary']").text())
25
-
26
- self << { title: t, link: l, updated: u, author: a, content: c }
27
- }
28
- end
29
- end