bwkfanboy 0.1.3 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -11,6 +11,10 @@ general assistance.
11
11
  than the whole gem on rubygems.org, so grab the source before
12
12
  struggling).
13
13
 
14
+ Plugins from version 1.1.4 are *incompatible* with the 0.1.x
15
+ series. Please reread in doc/plugin.rdoc the example of the skeleton
16
+ plugin.
17
+
14
18
  = Architecture
15
19
 
16
20
  == Plugins
@@ -18,10 +22,10 @@ struggling).
18
22
  bwkfanboy comes with several plugins. One of them, for example, parses a
19
23
  search page of dailyprincetonian.com looking for bwk's articles.
20
24
 
21
- The plugin is a Ruby class +Page+ that inherits Bwkfanboy::Parse
22
- parent, overriding 1 method.
25
+ The plugin is a Ruby class +Page+ that inherits Bwkfanboy::Parse parent,
26
+ overriding 1 method.
23
27
 
24
- The plugins can be in the system
28
+ Plugins can be in system
25
29
 
26
30
  `gem env gemdir`/gems/bwkfanboy-x.y.z/lib/bwkfanboy/plugins
27
31
 
@@ -89,7 +93,7 @@ There are 2 method to get an Atom feed via HTTP:
89
93
 
90
94
  2. Small *bwkfanboy_server* HTTP server. It can run from any user and
91
95
  thus is able to inherit env variables for discovering your HOME
92
- directory. Read bin/bwkfanboy_server to know how to operate it.
96
+ directory. Read doc/bwkfanboy_server.rdoc to know how to operate it.
93
97
 
94
98
  = License
95
99
  :include: doc/LICENSE
data/Rakefile CHANGED
@@ -9,12 +9,12 @@ require 'rake/testtask'
9
9
  spec = Gem::Specification.new() {|i|
10
10
  i.name = "bwkfanboy"
11
11
  i.summary = 'A converter from HTML to Atom feed that you can use to watch sites that do not provide its own feed.'
12
- i.version = '0.1.3'
12
+ i.version = `bin/#{i.name} -V`
13
13
  i.author = 'Alexander Gromnitsky'
14
14
  i.email = 'alexander.gromnitsky@gmail.com'
15
- i.homepage = 'http://github.com/gromnitsky/bwkfanboy'
15
+ i.homepage = "http://github.com/gromnitsky/#{i.name}"
16
16
  i.platform = Gem::Platform::RUBY
17
- i.required_ruby_version = '>= 1.9'
17
+ i.required_ruby_version = '>= 1.9.2'
18
18
  i.files = FileList['lib/**/*', 'bin/*', 'doc/*', '[A-Z]*', 'test/**/*']
19
19
 
20
20
  i.executables = FileList['bin/*'].gsub(/^bin\//, '')
@@ -22,10 +22,10 @@ spec = Gem::Specification.new() {|i|
22
22
 
23
23
  i.test_files = FileList['test/test_*.rb']
24
24
 
25
- i.rdoc_options << '-m' << 'Bwkfanboy' << '-x' << 'plugins'
26
- i.extra_rdoc_files = FileList['bin/*', 'doc/*']
25
+ i.rdoc_options << '-m' << 'doc/README.rdoc' << '-x' << 'plugins'
26
+ i.extra_rdoc_files = FileList['doc/*']
27
27
 
28
- i.add_dependency('activesupport', '>= 3.0.0')
28
+ i.add_dependency('activesupport', '>= 3.0.1')
29
29
  i.add_dependency('nokogiri', '>= 1.4.3')
30
30
  i.add_dependency('open4', '>= 1.0.1')
31
31
  i.add_dependency('jsonschema', '>= 2.0.0')
@@ -36,12 +36,11 @@ Rake::GemPackageTask.new(spec).define()
36
36
  task(default: %(repackage))
37
37
 
38
38
  Rake::RDocTask.new('doc') {|i|
39
- i.main = "Bwkfanboy"
40
- i.rdoc_files = FileList['doc/*', 'lib/**/*.rb', 'bin/*']
41
- i.rdoc_files.exclude("lib/**/plugins", "test")
39
+ i.main = 'doc/README.rdoc'
40
+ i.rdoc_files = FileList['doc/*', 'lib/**/*.rb']
41
+ i.rdoc_files.exclude("lib/**/plugins")
42
42
  }
43
43
 
44
44
  Rake::TestTask.new() {|i|
45
45
  i.test_files = FileList['test/test_*.rb']
46
- i.verbose = true
47
46
  }
data/bin/bwkfanboy CHANGED
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env ruby19
1
+ #!/usr/bin/env ruby
2
2
  # -*-ruby-*-
3
3
 
4
4
  # This program is executed by bin/bwkfanboy_server to do all dirty work:
@@ -11,13 +11,14 @@
11
11
  #
12
12
  # % bwkfanboy -h
13
13
  #
14
- # to get some basic help & read about Bwkfanboy module.
14
+ # to get some basic help--read about Bwkfanboy module.
15
15
 
16
16
  require 'shellwords'
17
17
  require_relative '../lib/bwkfanboy/parser'
18
18
 
19
19
  $conf = {
20
- mode: 'pipe',
20
+ mode: 'fast',
21
+ debug: false,
21
22
  banner: "Usage: #{File.basename($0)} [options] plugin-name"
22
23
  }
23
24
 
@@ -87,7 +88,8 @@ o = Bwkfanboy::Utils.cl_parse(ARGV, $conf[:banner]) # create OptionParser object
87
88
  o.on('-i', 'Show some info about the plugin') { |i| $conf[:mode] = 'info' }
88
89
  o.on('-l', 'List all plugins') { |i| $conf[:mode] = 'list' }
89
90
  o.on('-p', 'List all plugins paths') { |i| $conf[:mode] = 'path' }
90
- o.on('-D', '(ignore this) Use URI_DEBUG const instead URI in plugins') { |i| $conf[:mode] = 'debug' }
91
+ o.on('-O', '(ignore this) Execute all bwkfanboy_* utils in a pipe') { |i| $conf[:mode] = 'pipe' }
92
+ o.on('-D', '(ignore this) Use URI_DEBUG const instead URI in plugins') { |i| $conf[:debug] = true }
91
93
  Bwkfanboy::Utils.cl_parse(ARGV, $conf[:banner], o) # run cl parser
92
94
 
93
95
  plugin = Plugin.new(ARGV[0])
@@ -107,12 +109,11 @@ when 'info'
107
109
  plugin.load(opt).dump_info
108
110
  when 'version'
109
111
  puts Bwkfanboy::Meta::VERSION
110
- else
111
- # A pipe mode
112
+ when 'pipe'
112
113
  pn = plugin.load(opt)
113
114
  cmd = "./bwkfanboy_fetch | ./bwkfanboy_parse '#{plugin.path}' #{opt.size != 0 ? Shellwords.join(opt) : ''} | ./bwkfanboy_generate"
114
115
  if Bwkfanboy::Utils.cfg[:verbose] >= 2 then
115
- puts ($conf[:mode] != 'debug' ? pn.uri() : pn.class::Meta::URI_DEBUG)
116
+ puts (!$conf[:debug] ? pn.uri() : pn.class::Meta::URI_DEBUG)
116
117
  puts cmd
117
118
  exit 0
118
119
  end
@@ -121,10 +122,31 @@ else
121
122
  Dir.chdir(File.dirname(File.expand_path($0)))
122
123
 
123
124
  pipe = IO.popen(cmd, 'w+')
124
- pipe.puts ($conf[:mode] != 'debug' ? pn.uri : pn.class::Meta::URI_DEBUG)
125
+ pipe.puts (!$conf[:debug] ? pn.uri : pn.class::Meta::URI_DEBUG)
125
126
  pipe.close_write
126
127
  while line = pipe.gets
127
128
  puts line
128
129
  end
129
130
  pipe.close
131
+
132
+ else
133
+ # a 'new', faster way
134
+ pn = plugin.load(opt)
135
+
136
+ require_relative '../lib/bwkfanboy/fetch'
137
+ require_relative '../lib/bwkfanboy/generate'
138
+
139
+ # 1. fetch & parse
140
+ pn = Page.new(opt)
141
+ Bwkfanboy::Fetch.cat(!$conf[:debug] ? pn.uri() : pn.class::Meta::URI_DEBUG) {|stream|
142
+ pn.parse(stream)
143
+ }
144
+
145
+ if Bwkfanboy::Utils.cfg[:verbose] >= 2 then
146
+ pn.dump()
147
+ exit 0
148
+ end
149
+
150
+ # 2.generate
151
+ puts Bwkfanboy::Generate.atom(JSON.parse(pn.to_json))
130
152
  end
data/bin/bwkfanboy_fetch CHANGED
@@ -1,30 +1,13 @@
1
- #!/usr/bin/env ruby19
1
+ #!/usr/bin/env ruby
2
2
  # -*-ruby-*-
3
3
 
4
- # Read stdin for a URI or a full path to the local file, download it (or
5
- # read for the local file) and print to stdout.
6
-
7
- require 'open-uri'
8
-
9
- require_relative '../lib/bwkfanboy/utils'
4
+ require_relative '../lib/bwkfanboy/fetch'
10
5
 
11
6
  $conf = { banner: "Usage: #{File.basename($0)} [options] < uri" }
12
7
 
13
8
  Bwkfanboy::Utils.cl_parse(ARGV, $conf[:banner], nil, true)
14
9
 
15
- uri = gets.chomp()
16
-
17
- Bwkfanboy::Utils.veputs(1, "fetching #{uri}\n")
18
-
19
- begin
20
- open(uri, "User-Agent" => Bwkfanboy::Meta::USER_AGENT) {|f|
21
- if defined?(f.meta) && f.status[0] != '200' then
22
- Bwkfanboy::Utils.errx(1, "cannot fetch #{uri} : HTTP responce: #{f.status[0]}")
23
- end
24
- Bwkfanboy::Utils.veputs(1, "charset = #{f.content_type_parse[1][1]}\n") if defined?(f.meta)
25
- f.each_line {|i| puts i}
26
- }
27
- rescue
28
- # typically Errno::ENOENT
29
- Bwkfanboy::Utils.errx(1, "cannot fetch: #{$!}");
30
- end
10
+ uri = gets
11
+ Bwkfanboy::Fetch.cat(uri) {|f|
12
+ f.each_line {|line| puts line }
13
+ }
@@ -1,24 +1,16 @@
1
- #!/usr/bin/env ruby19
1
+ #!/usr/bin/env ruby
2
2
  # -*-ruby-*-
3
3
 
4
- # Read stdin for JSON, generate from it an Atom feed and print the
5
- # result to stdout in UTF-8.
6
- #
7
- # One can validate the JSON by providing '--check' command line option
8
- # (by default the validating is off).
9
-
10
- require 'rss/maker'
11
- require 'date'
12
- require 'json'
13
- require 'jsonschema'
14
-
15
- require_relative '../lib/bwkfanboy/utils'
4
+ require_relative '../lib/bwkfanboy/generate'
16
5
 
17
6
  $conf = {
18
7
  banner: "Usage: #{File.basename($0)} [options] < json",
19
8
  check: false
20
9
  }
21
10
 
11
+ # we are expecting the input ONLY in UTF-8
12
+ Encoding.default_external = 'UTF-8'
13
+
22
14
  o = Bwkfanboy::Utils.cl_parse(ARGV, $conf[:banner])
23
15
  o.on('--check', 'Validate the input (slow!)') { |i| $conf[:check] = true }
24
16
  Bwkfanboy::Utils.cl_parse(ARGV, $conf[:banner], o) # run cl parser
@@ -29,52 +21,5 @@ rescue
29
21
  Bwkfanboy::Utils.errx(1, "stdin had invalid JSON");
30
22
  end
31
23
 
32
- # validate the input
33
- schema = Bwkfanboy::Utils.gem_dir_system() + '/schema.js'
34
- if $conf[:check] then
35
- begin
36
- JSON::Schema.validate(j, JSON.parse(File.read(schema)))
37
- rescue
38
- Bwkfanboy::Utils.errx(1, "JSON validation with schema (#{schema}) failed");
39
- end
40
- end
41
-
42
- feed = RSS::Maker.make("atom") { |maker|
43
- maker.channel.id = j['channel']['id']
44
- maker.channel.updated = j['channel']['updated']
45
- maker.channel.author = j['channel']['author']
46
- maker.channel.title = j['channel']['title']
47
-
48
- maker.channel.links.new_link {|i|
49
- i.href = j['channel']['link']
50
- i.rel = 'alternate'
51
- i.type = 'text/html' # eh
52
- }
53
-
54
- maker.items.do_sort = true
55
-
56
- j['x_entries'].each { |i|
57
- maker.items.new_item do |item|
58
- item.links.new_link {|k|
59
- k.href = i['link']
60
- k.rel = 'alternate'
61
- k.type = 'text/html' # only to make happy crappy pr2nntp gateway
62
- }
63
- item.title = i['title']
64
- item.author = i['author']
65
- item.updated = i['updated']
66
- item.content.type = j['channel']['x_entries_content_type']
67
-
68
- case item.content.type
69
- when 'text'
70
- item.content.content = i['content']
71
- when 'html'
72
- item.content.content = i['content']
73
- else
74
- item.content.xhtml = i['content']
75
- end
76
- end
77
- }
78
- }
79
-
80
- puts feed
24
+ if $conf[:check] then Bwkfanboy::Generate.validate(j) end
25
+ puts Bwkfanboy::Generate.atom(j)
data/bin/bwkfanboy_parse CHANGED
@@ -1,12 +1,6 @@
1
- #!/usr/bin/env ruby19
1
+ #!/usr/bin/env ruby
2
2
  # -*-ruby-*-
3
3
 
4
- # Take 1 command line parameter: a full path to a plugin.
5
- #
6
- # Read stdin for a HTML, parse it and print the result to stdout in JSON
7
- # format. If '-vv' command line parameters were given, output will be in
8
- # 'key: value' pairs and <em>not</em> in JSON.
9
-
10
4
  require_relative '../lib/bwkfanboy/parser'
11
5
 
12
6
  $conf = {
@@ -19,12 +13,12 @@ if ARGV.size == 0 then
19
13
  abort($conf[:banner])
20
14
  else
21
15
  Bwkfanboy::Utils.plugin_load(ARGV[0], Bwkfanboy::Meta::PLUGIN_CLASS)
22
- end;
16
+ end
23
17
 
24
18
  opt = Bwkfanboy::Utils.plugin_opts(ARGV)
25
19
  pn = Page.new(opt)
26
20
  pn.check()
27
- pn.parse()
21
+ pn.parse(STDIN)
28
22
 
29
23
  if Bwkfanboy::Utils.cfg[:verbose] >= 2 then
30
24
  pn.dump()
data/bin/bwkfanboy_server CHANGED
@@ -1,42 +1,9 @@
1
- #!/usr/bin/env ruby19
1
+ #!/usr/bin/env ruby
2
2
  # -*-ruby-*-
3
3
 
4
- # Start a HTTP server (by default on 127.0.0.1:9042). To get Atom feeds
5
- # from it, initiate GET request with URI
6
- #
7
- # http://localhost:9042/?p=PLUGIN
8
- #
9
- # where +PLUGIN+ is a name of a bwkfanboy's plugin (without '.re' suffix).
10
- # If the plugin requires additional options you can specify them like:
11
- #
12
- # http://localhost:9042/?p=PLUGIN&o=opt1%20%22opt2%20has%20spaces%22
13
- #
14
- # where <tt>opt1%20%22opt2%20has%20spaces%22</tt> is a encoded string
15
- # <tt>opt1 "opt2 has spaces"</tt>.
16
- #
17
- # To list all available plugins, point you browser to
18
- #
19
- # http://localhost:9042/list
20
- #
21
- # The server is intended to run from a non-root user from
22
- # <tt>~/.login</tt> file. It can detach from a terminal if you give it
23
- # '-d' command line option.
24
- #
25
- # For other help, type:
26
- #
27
- # bwkfanboy_server -h
28
- #
29
- # The server maintains 2 logs:
30
- #
31
- # /tmp/bwkfanboy/USER/log/bwkfanboy_server.log
32
- # /tmp/bwkfanboy/USER/log/bwkfanboy_server-access.log
33
- #
34
- # The file with a pid:
35
- #
36
- # /tmp/bwkfanboy/USER/bwkfanboy_server.pid
37
-
38
4
  require 'shellwords'
39
5
  require 'webrick'
6
+ require 'date'
40
7
  require_relative '../lib/bwkfanboy/utils'
41
8
 
42
9
  $conf = {
@@ -78,8 +45,14 @@ class FeedServlet < WEBrick::HTTPServlet::AbstractServlet # :nodoc: all
78
45
  if r[0] != 0 then
79
46
  raise WEBrick::HTTPStatus::InternalServerError.new("Errors in the pipeline:\n\n #{r[1]}")
80
47
  end
81
-
48
+
82
49
  res.body = r[2]
50
+
51
+ # search for <updated> tag and set Last-Modified header
52
+ if (m = r[2].match('<updated>(.+?)</updated>'))
53
+ res['Last-Modified'] = DateTime.parse(m.to_s).httpdate
54
+ end
55
+
83
56
  else
84
57
  raise WEBrick::HTTPStatus::InternalServerError.new("Parameter 'p' required")
85
58
  end
data/doc/NEWS.rdoc CHANGED
@@ -1,6 +1,20 @@
1
- === Current
1
+ === 1.1.4
2
2
 
3
- - See git log.
3
+ - INCOMPATIBILITY: from now on, all plugins must NOT read the stdin
4
+ but bwkfanboy's provided stream. See doc/plugin.rdoc.
5
+
6
+ - Moved the code from bwkfanboy_* to libraries.
7
+
8
+ - From now on, bwkfanboy util by default does not run utils in a pipe but
9
+ uses libraries directly. One can restore the old functionality with
10
+ '-O' CL switch.
11
+
12
+ === 0.1.4
13
+
14
+ - The minimum required Ruby version is 1.9.2.
15
+ - bwkfanboy_server now inserts a Last-Modified header.
16
+ - Fixed (?) a bug in bwkfanboy_generate with external encoding.
17
+ - Updated tests for Ruby 1.9.2.
4
18
 
5
19
  === 0.1.3
6
20
 
data/doc/README.rdoc CHANGED
@@ -11,6 +11,10 @@ general assistance.
11
11
  than the whole gem on rubygems.org, so grab the source before
12
12
  struggling).
13
13
 
14
+ Plugins from version 1.1.4 are *incompatible* with the 0.1.x
15
+ series. Please reread in doc/plugin.rdoc the example of the skeleton
16
+ plugin.
17
+
14
18
  = Architecture
15
19
 
16
20
  == Plugins
@@ -18,10 +22,10 @@ struggling).
18
22
  bwkfanboy comes with several plugins. One of them, for example, parses a
19
23
  search page of dailyprincetonian.com looking for bwk's articles.
20
24
 
21
- The plugin is a Ruby class +Page+ that inherits Bwkfanboy::Parse
22
- parent, overriding 1 method.
25
+ The plugin is a Ruby class +Page+ that inherits Bwkfanboy::Parse parent,
26
+ overriding 1 method.
23
27
 
24
- The plugins can be in the system
28
+ Plugins can be in system
25
29
 
26
30
  `gem env gemdir`/gems/bwkfanboy-x.y.z/lib/bwkfanboy/plugins
27
31
 
@@ -89,7 +93,7 @@ There are 2 method to get an Atom feed via HTTP:
89
93
 
90
94
  2. Small *bwkfanboy_server* HTTP server. It can run from any user and
91
95
  thus is able to inherit env variables for discovering your HOME
92
- directory. Read bin/bwkfanboy_server to know how to operate it.
96
+ directory. Read doc/bwkfanboy_server.rdoc to know how to operate it.
93
97
 
94
98
  = License
95
99
  :include: doc/LICENSE
@@ -0,0 +1,4 @@
1
+ = bwkfanboy_fetch
2
+
3
+ Read stdin for a URI or a full path to the local file, download it (or
4
+ read from the local file) and print the result to stdout.
@@ -0,0 +1,7 @@
1
+ = bwkfanboy_generate
2
+
3
+ Read stdin for JSON, generate from it an Atom feed and print the
4
+ result to stdout in UTF-8.
5
+
6
+ One can validate the JSON by providing '--check' command line option
7
+ (by default the validating is off).
@@ -0,0 +1,7 @@
1
+ = bwkfanboy_parse
2
+
3
+ Takes 1 command line parameter: a full path to a plugin.
4
+
5
+ Reads stdin for a HTML, parses it and prints the result to stdout in
6
+ JSON format. If '-vv' command line parameters were given, output will
7
+ be in 'key: value' pairs and <em>not</em> in JSON.
@@ -0,0 +1,35 @@
1
+ = bwkfanboy_server
2
+
3
+ Start a HTTP server (by default on 127.0.0.1:9042). To get Atom feeds
4
+ from it, initiate GET request with URI
5
+
6
+ http://localhost:9042/?p=PLUGIN
7
+
8
+ where +PLUGIN+ is a name of a bwkfanboy's plugin (without '.re' suffix).
9
+ If the plugin requires additional options you can specify them like:
10
+
11
+ http://localhost:9042/?p=PLUGIN&o=opt1%20%22opt2%20has%20spaces%22
12
+
13
+ where <tt>opt1%20%22opt2%20has%20spaces%22</tt> is an encoded string
14
+ <tt>opt1 "opt2 has spaces"</tt>.
15
+
16
+ To list all available plugins, point your browser to
17
+
18
+ http://localhost:9042/list
19
+
20
+ The server is intended to run from a non-root user from
21
+ <tt>~/.login</tt> file. It can detach from a terminal if you give it
22
+ '-d' command line option.
23
+
24
+ For other help, type:
25
+
26
+ bwkfanboy_server -h
27
+
28
+ The server maintains 2 logs:
29
+
30
+ /tmp/bwkfanboy/USER/log/bwkfanboy_server.log
31
+ /tmp/bwkfanboy/USER/log/bwkfanboy_server-access.log
32
+
33
+ The file with a pid:
34
+
35
+ /tmp/bwkfanboy/USER/bwkfanboy_server.pid
data/doc/plugin.rdoc CHANGED
@@ -1,12 +1,11 @@
1
- = HOWTO Write a \Plugin
1
+ = How to Write a \Plugin
2
2
 
3
3
  First of all, look at examples provided with bwkfanboy. They were
4
4
  intended to be 100% working because I was writing them for myself.
5
5
 
6
- Basically, all you need is to write a class named _Page_ that
7
- inherits this class Bwkfanboy::Parse, override in the child #myparse
8
- method and write a simple module named _Meta_ inside your _Page_
9
- class.
6
+ Basically, all you need is to write a class named _Page_ that inherits
7
+ class Bwkfanboy::Parse, override in the child \#myparse method and write
8
+ a simple module named _Meta_ inside your _Page_ class.
10
9
 
11
10
  == Skeleton
12
11
 
@@ -24,9 +23,9 @@ Here is a skeleton of a plugin:
24
23
  CONTENT_TYPE = 'html'
25
24
  end
26
25
 
27
- def myparse()
28
- # read stdin and parse it
29
- doc = Nokogiri::HTML(STDIN, nil, Meta::ENC)
26
+ def myparse(stream)
27
+ # read 'stream' IO object and parse it
28
+ doc = Nokogiri::HTML(stream, nil, Meta::ENC)
30
29
  doc.xpath("XPATH QUERY").each {|i|
31
30
  t = clean(i.xpath("XPATH QUERY").text())
32
31
  l = clean(i.xpath("XPATH QUERY").text())
@@ -52,8 +51,8 @@ it should.
52
51
 
53
52
  === \Meta
54
53
 
55
- Module _Meta_ can have only constants--and *all* constants listed in
56
- the skeleton are required.
54
+ Module _Meta_ can only have constants--and *all* constants listed in the
55
+ skeleton are mandatory.
57
56
 
58
57
  * <tt>URI</tt>--can be a <tt>http(s)://</tt> or <tt>ftp://</tt> URL
59
58
  or just a path to a file on your local machine, as
@@ -75,13 +74,17 @@ the skeleton are required.
75
74
 
76
75
  === myparse
77
76
 
78
- In #myparse method please read stdin. The contends of it is the raw
79
- HTML you want to parse. The general idea:
77
+ In \#myparse method read 'stream' IO object. The contents of it is the
78
+ raw HTML you want to parse. The general idea:
80
79
 
81
- * Atom feed must contain at least 1 entry, so look in HTML for some
82
- crap which you break into 5 peaces: title of the entry, link for
83
- it, a date for the entry, who is author of the entry and its
84
- contents.
80
+ * Atom feed must contain at least 1 entry, so look in the HTML for some
81
+ crap which you must break into 5 pieces:
82
+
83
+ - a title of the entry
84
+ - a link for it
85
+ - a date for the entry
86
+ - who is the author of the entry and
87
+ - its contents.
85
88
 
86
89
  * After you scan and grab 1 entry, create a hash and add it to
87
90
  _self_ as it was in the skeleton:
@@ -89,20 +92,20 @@ HTML you want to parse. The general idea:
89
92
  self << { title: t, link: l, updated: u, author: a, content: c }
90
93
 
91
94
  Here variables _t_, _l_, _u_, _a_ and _c_ contains the actual
92
- values of 5 peaces for the entry. Names of the keys in hash are
95
+ values of 5 pieces for the entry. Names of the keys in the hash are
93
96
  important of course--don't invent your own.
94
97
 
95
- * There would be probably more crap in HTML that you can use to
98
+ * Probably there would be more crap in the HTML that you can use to
96
99
  construct another entry. Keep parsing and adding entries.
97
100
 
98
- * While you scanning, use the 2 helper methods for cleaning each
99
- peace: \#clean, which removed duplicate spaces and #date, which
100
- parses a sting and return a date in ISO8601 format. You may
101
- override #date method if you like.
101
+ * While you're scanning, use the 2 helper methods for cleaning each
102
+ piece: \#clean, which removes duplicate spaces and \#date, which parses
103
+ a string and returns a date in ISO8601 format. You may override \#date
104
+ method if you like.
102
105
 
103
106
  === Options
104
107
 
105
- Plugins can have _options_ and a user should provide then to the plugin
108
+ Plugins can have _options_ and a user should provide them to the plugin
106
109
  in the real-time. For example, say you're scraping a site where many
107
110
  users are wasting their time. If you want to watch for several of them
108
111
  it is silly to write a new plugin every time for a new
@@ -130,20 +133,20 @@ _option_ becomes mandatory for the end-user.
130
133
  To test how nice your plugin works, save the html page to the file
131
134
  and type:
132
135
 
133
- % bwkfanboy_parse -vv path/to/a/plugin.rb < saved_page.html
136
+ % bwkfanboy_parse -vv /path/to/the/plugin.rb < saved_page.html
134
137
 
135
138
  to see the result as in plain text, or
136
139
 
137
- % bwkfanboy_parse -v path/to/a/plugin.rb < saved_page.html
140
+ % bwkfanboy_parse -v /path/to/the/plugin.rb < saved_page.html
138
141
 
139
- as pretty JSON.
142
+ as a pretty JSON.
140
143
 
141
144
  For option-enabled plugins, supply additional parameters for them after
142
145
  the plugin path:
143
146
 
144
- % bwkfanboy_parse -vv path/to/a/plugin.rb \
145
- option_1 "options 2" < saved_page.html
147
+ % bwkfanboy_parse -vv /path/to/the/plugin.rb \
148
+ option_1 "option 2" < saved_page.html
146
149
 
147
150
  <tt>bwkfanboy_parse</tt> return 0 if no errors occurred or >= 1 if you
148
151
  have errors in your plugin code. N.B.: the output from
149
- <tt>bwkparser_parse</tt> is always in UTF-8.
152
+ <tt>bwkfanboy_parse</tt> *must* always be in UTF-8.
@@ -0,0 +1,36 @@
1
+ require 'open-uri'
2
+
3
+ require_relative 'utils'
4
+
5
+ module Bwkfanboy
6
+ class Fetch
7
+
8
+ # If no block given, return contents of fetch'ed URI. Otherwise,
9
+ # execute the block with 1 parameter--stream.
10
+ def self.cat(uri)
11
+ uri.chomp!
12
+
13
+ Bwkfanboy::Utils.veputs(1, "fetching #{uri}\n")
14
+
15
+ begin
16
+ open(uri, "User-Agent" => Bwkfanboy::Meta::USER_AGENT) {|f|
17
+ if defined?(f.meta) && f.status[0] != '200' then
18
+ Bwkfanboy::Utils.errx(1, "cannot fetch #{uri} : HTTP responce: #{f.status[0]}")
19
+ end
20
+ Bwkfanboy::Utils.veputs(1, "charset = #{f.content_type_parse[1][1]}\n") if defined?(f.meta)
21
+ if block_given?
22
+ yield f
23
+ else
24
+ return f.read
25
+ end
26
+ }
27
+ rescue
28
+ # typically Errno::ENOENT
29
+ Bwkfanboy::Utils.errx(1, "cannot fetch: #{$!}");
30
+ end
31
+
32
+ return ""
33
+ end
34
+
35
+ end
36
+ end
@@ -0,0 +1,63 @@
1
+ require 'rss/maker'
2
+ require 'date'
3
+ require 'json'
4
+ require 'jsonschema'
5
+
6
+ require_relative 'utils'
7
+
8
+ module Bwkfanboy
9
+ class Generate
10
+
11
+ def self.validate(t)
12
+ schema = Bwkfanboy::Utils.gem_dir_system() + '/schema.js'
13
+ begin
14
+ JSON::Schema.validate(t, JSON.parse(File.read(schema)))
15
+ rescue
16
+ Bwkfanboy::Utils.errx(1, "JSON validation with schema (#{schema}) failed");
17
+ end
18
+ end
19
+
20
+ def self.atom(src)
21
+ feed = RSS::Maker.make("atom") { |maker|
22
+ maker.channel.id = src['channel']['id']
23
+ maker.channel.updated = src['channel']['updated']
24
+ maker.channel.author = src['channel']['author']
25
+ maker.channel.title = src['channel']['title']
26
+
27
+ maker.channel.links.new_link {|i|
28
+ i.href = src['channel']['link']
29
+ i.rel = 'alternate'
30
+ i.type = 'text/html' # eh
31
+ }
32
+
33
+ maker.items.do_sort = true
34
+
35
+ src['x_entries'].each { |i|
36
+ maker.items.new_item do |item|
37
+ item.links.new_link {|k|
38
+ k.href = i['link']
39
+ k.rel = 'alternate'
40
+ k.type = 'text/html' # only to make happy crappy pr2nntp gateway
41
+ }
42
+ item.title = i['title']
43
+ item.author = i['author']
44
+ item.updated = i['updated']
45
+ item.content.type = src['channel']['x_entries_content_type']
46
+
47
+ case item.content.type
48
+ when 'text'
49
+ item.content.content = i['content']
50
+ when 'html'
51
+ item.content.content = i['content']
52
+ else
53
+ item.content.xhtml = i['content']
54
+ end
55
+ end
56
+ }
57
+ }
58
+
59
+ return feed
60
+ end
61
+
62
+ end
63
+ end
@@ -8,7 +8,7 @@ module Bwkfanboy
8
8
 
9
9
  # :include: ../../doc/plugin.rdoc
10
10
  class Parse
11
- ENTRIES_MAX = 64
11
+ ENTRIES_MAX = 128
12
12
 
13
13
  attr_reader :opt
14
14
 
@@ -18,10 +18,10 @@ module Bwkfanboy
18
18
  end
19
19
 
20
20
  # Invokes #myparse & checks if it has grabbed something.
21
- def parse()
21
+ def parse(stream)
22
22
  @entries = []
23
23
  begin
24
- myparse()
24
+ myparse(stream)
25
25
  rescue
26
26
  @entries = []
27
27
  Utils.errx(1, "parsing failed: #{$!}\n\nBacktrace:\n\n#{$!.backtrace.join("\n")}")
@@ -99,7 +99,7 @@ module Bwkfanboy
99
99
  protected
100
100
 
101
101
  # This *must* be overridden in the child.
102
- def myparse()
102
+ def myparse(stream)
103
103
  raise "plugin isn't finished yet"
104
104
  end
105
105
 
@@ -8,16 +8,16 @@ class Page < Bwkfanboy::Parse
8
8
  URI = 'http://www.dailyprincetonian.com/advanced_search/?author=Brian+Kernighan'
9
9
  URI_DEBUG = '/home/alex/lib/software/alex/bwkfanboy/test/semis/bwk.html'
10
10
  ENC = 'UTF-8'
11
- VERSION = 1
11
+ VERSION = 2
12
12
  COPYRIGHT = "See bwkfanboy's LICENSE file"
13
13
  TITLE = "Brian Kernighan's articles from Daily Princetonian"
14
14
  CONTENT_TYPE = 'html'
15
15
  end
16
16
 
17
- def myparse()
17
+ def myparse(stream)
18
18
  url = "http://www.dailyprincetonian.com"
19
19
 
20
- doc = Nokogiri::HTML(STDIN, nil, Meta::ENC)
20
+ doc = Nokogiri::HTML(stream, nil, Meta::ENC)
21
21
  doc.xpath("//div[@class='article_item']").each {|i|
22
22
  t = clean(i.xpath("h2/a").children.text())
23
23
  fail 'unable to extract link' if (link = clean(i.xpath("h2/a")[0].attributes['href'].value()).empty?)
@@ -5,7 +5,7 @@ class Page < Bwkfanboy::Parse
5
5
  URI = '/usr/ports/UPDATING'
6
6
  URI_DEBUG = URI
7
7
  ENC = 'ASCII'
8
- VERSION = 1
8
+ VERSION = 2
9
9
  COPYRIGHT = "See bwkfanboy's LICENSE file"
10
10
  TITLE = "News from FreeBSD ports"
11
11
  CONTENT_TYPE = 'text'
@@ -24,7 +24,7 @@ class Page < Bwkfanboy::Parse
24
24
  return t
25
25
  end
26
26
 
27
- def myparse()
27
+ def myparse(stream)
28
28
  re_u = /^(\d{8}):$/
29
29
  re_t1 = /^ {2}AFFECTS:\s+(.+)$/
30
30
  re_t2 = /^\s+(.+)$/
@@ -33,7 +33,7 @@ class Page < Bwkfanboy::Parse
33
33
  ready = false
34
34
  mode = nil
35
35
  t = l = u = a = c = nil
36
- while line = STDIN.gets
36
+ while line = stream.gets
37
37
  line.rstrip!
38
38
 
39
39
  if line =~ re_u then
@@ -17,17 +17,17 @@ class Page < Bwkfanboy::Parse
17
17
  URI = 'http://www.quora.com/#{opt[0]}/answers'
18
18
  URI_DEBUG = '/home/alex/lib/software/alex/bwkfanboy/test/semis/quora.html'
19
19
  ENC = 'UTF-8'
20
- VERSION = 4
20
+ VERSION = 5
21
21
  COPYRIGHT = "See bwkfanboy's LICENSE file"
22
22
  TITLE = "Last n answers (per-user) from Quora; requires nodejs"
23
23
  CONTENT_TYPE = 'html'
24
24
  end
25
25
 
26
- def myparse()
26
+ def myparse(stream)
27
27
  profile = opt[0] # for example, 'Brandon-Smietana'
28
28
 
29
29
  # read stdin
30
- doc = Nokogiri::HTML(STDIN, nil, Meta::ENC)
30
+ doc = Nokogiri::HTML(stream, nil, Meta::ENC)
31
31
 
32
32
  # extract & evaluate JavaScript into tstp
33
33
  tstp = nil
@@ -7,7 +7,7 @@ require 'active_support/core_ext/module/attribute_accessors'
7
7
  module Bwkfanboy
8
8
  module Meta
9
9
  NAME = 'bwkfanboy'
10
- VERSION = '0.1.3'
10
+ VERSION = '1.1.4'
11
11
  USER_AGENT = "#{NAME}/#{VERSION} (#{RUBY_PLATFORM}; N; #{Encoding.default_external.name}; #{RUBY_ENGINE}; rv:#{RUBY_VERSION}.#{RUBY_PATCHLEVEL})"
12
12
  PLUGIN_CLASS = 'Page'
13
13
  DIR_TMP = "/tmp/#{Meta::NAME}/#{ENV['USER']}"
@@ -89,7 +89,7 @@ module Bwkfanboy
89
89
  # TODO get rid of eval()
90
90
  fail "class #{class_name} isn't defined" if (! eval("defined?#{class_name}") || ! eval(class_name).is_a?(Class) )
91
91
  rescue LoadError
92
- errx(1, "cannot load plugin '#{path}'");
92
+ errx(1, "cannot load plugin '#{path}' #{$!}");
93
93
  rescue Exception
94
94
  errx(1, "plugin '#{path}' has errors: #{$!}\n\nBacktrace:\n\n#{$!.backtrace.join("\n")}")
95
95
  end
data/test/test_fetch.rb CHANGED
@@ -1,10 +1,7 @@
1
- #!/usr/bin/env ruby19
2
-
3
- require 'minitest/autorun'
4
1
  require 'digest/md5'
5
2
 
6
3
  require_relative '../lib/bwkfanboy/utils'
7
- require_relative 'ts_utils.rb'
4
+ require_relative 'ts_utils'
8
5
 
9
6
  # TODO add HTTP 404 check; drop connection from server during HTTP 200
10
7
  # replay...
@@ -1,6 +1,3 @@
1
- #!/usr/bin/env ruby19
2
-
3
- require 'minitest/autorun'
4
1
  require 'digest/md5'
5
2
 
6
3
  require_relative '../lib/bwkfanboy/utils'
data/test/test_parse.rb CHANGED
@@ -1,6 +1,3 @@
1
- #!/usr/bin/env ruby19
2
-
3
- require 'minitest/autorun'
4
1
  require 'digest/md5'
5
2
 
6
3
  require_relative '../lib/bwkfanboy/utils'
@@ -17,16 +14,16 @@ class TestParse < MiniTest::Unit::TestCase
17
14
 
18
15
  def test_empty_plugin
19
16
  cmd CMD
20
- r = Bwkfanboy::Utils.cmd_run("#{cmd CMD} #{@tpath}plugins/empty.rb ")
17
+ r = Bwkfanboy::Utils.cmd_run("#{cmd CMD} #{Dir.pwd}/#{@tpath}plugins/empty.rb ")
21
18
  assert_equal(1, r[0])
22
19
  assert_match(/plugin .+ has errors: class Page isn't defined/, r[1])
23
20
  end
24
21
 
25
22
  def test_plugin_parse
26
23
  cmd CMD
27
- r = Bwkfanboy::Utils.cmd_run("#{cmd CMD} #{@tpath}plugins/bwk.rb < #{@tpath}semis/bwk.html")
24
+ r = Bwkfanboy::Utils.cmd_run("#{cmd CMD} #{Dir.pwd}/#{@tpath}../lib/bwkfanboy/plugins/bwk.rb < #{@tpath}semis/bwk.html")
28
25
  assert_equal(0, r[0])
29
- # bin/bwkfanboy_parse test/plugins/bwk.rb < test/semis/bwk.html | md5
30
- assert_equal('371fb5a5c5b5519b5eff085df2d31e18', Digest::MD5.hexdigest(r[2]))
26
+ # bin/bwkfanboy_parse `pwd`/lib/bwkfanboy/plugins/bwk.rb < test/semis/bwk.html | md5
27
+ assert_equal('a433a4a27bafb060a41aa85a40808056', Digest::MD5.hexdigest(r[2]))
31
28
  end
32
29
  end
data/test/test_server.rb CHANGED
@@ -1,6 +1,3 @@
1
- #!/usr/bin/env ruby19
2
-
3
- require 'minitest/autorun'
4
1
  require 'open-uri'
5
2
  require 'digest/md5'
6
3
 
data/test/ts_utils.rb CHANGED
@@ -1,3 +1,11 @@
1
+ # don't run test automatically
2
+ # if they were invoked as 'gem check -t ...'
3
+ if $0 =~ /gem/
4
+ require 'minitest/unit'
5
+ else
6
+ require 'minitest/autorun'
7
+ end
8
+
1
9
  # return the right directory for _c_
2
10
  def cmd(c)
3
11
  @tpath = ''
metadata CHANGED
@@ -3,10 +3,10 @@ name: bwkfanboy
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
- - 0
7
6
  - 1
8
- - 3
9
- version: 0.1.3
7
+ - 1
8
+ - 4
9
+ version: 1.1.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Alexander Gromnitsky
@@ -14,27 +14,29 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-10-29 00:00:00 +03:00
17
+ date: 2010-11-08 00:00:00 +02:00
18
18
  default_executable: bwkfanboy
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: activesupport
22
22
  prerelease: false
23
23
  requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
24
25
  requirements:
25
26
  - - ">="
26
27
  - !ruby/object:Gem::Version
27
28
  segments:
28
29
  - 3
29
30
  - 0
30
- - 0
31
- version: 3.0.0
31
+ - 1
32
+ version: 3.0.1
32
33
  type: :runtime
33
34
  version_requirements: *id001
34
35
  - !ruby/object:Gem::Dependency
35
36
  name: nokogiri
36
37
  prerelease: false
37
38
  requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
38
40
  requirements:
39
41
  - - ">="
40
42
  - !ruby/object:Gem::Version
@@ -49,6 +51,7 @@ dependencies:
49
51
  name: open4
50
52
  prerelease: false
51
53
  requirement: &id003 !ruby/object:Gem::Requirement
54
+ none: false
52
55
  requirements:
53
56
  - - ">="
54
57
  - !ruby/object:Gem::Version
@@ -63,6 +66,7 @@ dependencies:
63
66
  name: jsonschema
64
67
  prerelease: false
65
68
  requirement: &id004 !ruby/object:Gem::Requirement
69
+ none: false
66
70
  requirements:
67
71
  - - ">="
68
72
  - !ruby/object:Gem::Version
@@ -84,15 +88,14 @@ executables:
84
88
  extensions: []
85
89
 
86
90
  extra_rdoc_files:
87
- - bin/bwkfanboy_generate
88
- - bin/bwkfanboy_parse
89
- - bin/bwkfanboy
90
- - bin/bwkfanboy_server
91
- - bin/bwkfanboy_fetch
92
- - doc/plugin.rdoc
93
- - doc/README.rdoc
94
91
  - doc/LICENSE
95
92
  - doc/NEWS.rdoc
93
+ - doc/README.rdoc
94
+ - doc/plugin.rdoc
95
+ - doc/bwkfanboy_fetch.rdoc
96
+ - doc/bwkfanboy_generate.rdoc
97
+ - doc/bwkfanboy_parse.rdoc
98
+ - doc/bwkfanboy_server.rdoc
96
99
  files:
97
100
  - lib/bwkfanboy/plugins/bwk.rb
98
101
  - lib/bwkfanboy/plugins/freebsd-ports-update.rb
@@ -101,20 +104,25 @@ files:
101
104
  - lib/bwkfanboy/parser.rb
102
105
  - lib/bwkfanboy/utils.rb
103
106
  - lib/bwkfanboy/schema.js
107
+ - lib/bwkfanboy/fetch.rb
108
+ - lib/bwkfanboy/generate.rb
104
109
  - bin/bwkfanboy_generate
105
110
  - bin/bwkfanboy_parse
106
111
  - bin/bwkfanboy
107
112
  - bin/bwkfanboy_server
108
113
  - bin/bwkfanboy_fetch
109
- - doc/plugin.rdoc
110
- - doc/README.rdoc
111
114
  - doc/LICENSE
112
115
  - doc/NEWS.rdoc
116
+ - doc/README.rdoc
117
+ - doc/plugin.rdoc
118
+ - doc/bwkfanboy_fetch.rdoc
119
+ - doc/bwkfanboy_generate.rdoc
120
+ - doc/bwkfanboy_parse.rdoc
121
+ - doc/bwkfanboy_server.rdoc
113
122
  - README.rdoc
114
123
  - Rakefile
115
124
  - TODO
116
125
  - test/plugins/empty.rb
117
- - test/plugins/bwk.rb
118
126
  - test/semis/bwk.html
119
127
  - test/semis/bwk.json
120
128
  - test/semis/quora.html
@@ -134,20 +142,23 @@ licenses: []
134
142
  post_install_message:
135
143
  rdoc_options:
136
144
  - -m
137
- - Bwkfanboy
145
+ - doc/README.rdoc
138
146
  - -x
139
147
  - plugins
140
148
  require_paths:
141
149
  - lib
142
150
  required_ruby_version: !ruby/object:Gem::Requirement
151
+ none: false
143
152
  requirements:
144
153
  - - ">="
145
154
  - !ruby/object:Gem::Version
146
155
  segments:
147
156
  - 1
148
157
  - 9
149
- version: "1.9"
158
+ - 2
159
+ version: 1.9.2
150
160
  required_rubygems_version: !ruby/object:Gem::Requirement
161
+ none: false
151
162
  requirements:
152
163
  - - ">="
153
164
  - !ruby/object:Gem::Version
@@ -157,7 +168,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
157
168
  requirements: []
158
169
 
159
170
  rubyforge_project:
160
- rubygems_version: 1.3.6
171
+ rubygems_version: 1.3.7
161
172
  signing_key:
162
173
  specification_version: 3
163
174
  summary: A converter from HTML to Atom feed that you can use to watch sites that do not provide its own feed.
data/test/plugins/bwk.rb DELETED
@@ -1,29 +0,0 @@
1
- require 'nokogiri'
2
-
3
- class Page < Bwkfanboy::Parse
4
- module Meta
5
- URI = "html/bwk.html"
6
- ENC = 'UTF-8'
7
- VERSION = 1
8
- COPYRIGHT = '(c) 2010 Alexander Gromnitsky'
9
- TITLE = "Brian Kernighan's articles from Daily Princetonian"
10
- CONTENT_TYPE = 'html'
11
- end
12
-
13
- def myparse()
14
- url = "http://www.dailyprincetonian.com"
15
-
16
- doc = Nokogiri::HTML(STDIN, nil, Meta::ENC)
17
- doc.xpath("//div[@class='article_item']").each {|i|
18
- t = clean(i.xpath("h2/a").children.text())
19
- fail 'unable to extract link' if (link = clean(i.xpath("h2/a")[0].attributes['href'].value()).empty?)
20
- link = clean(i.xpath("h2/a")[0].attributes['href'].value())
21
- l = url + link + "print"
22
- u = date(i.xpath("h2").children[1].text())
23
- a = clean(i.xpath("div/span/a[1]").children.text())
24
- c = clean(i.xpath("div[@class='summary']").text())
25
-
26
- self << { title: t, link: l, updated: u, author: a, content: c }
27
- }
28
- end
29
- end