liveblog-indexer 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 56937c6f8eb2b40f417e73a5512fa60593647517
4
- data.tar.gz: a87152912b799beab92e305565065ee391364b32
3
+ metadata.gz: 711ba6e2a290dfc67dfc1703ffcddf417c7c90d3
4
+ data.tar.gz: 5525552551eee1953370b99034d2a712bb308ac2
5
5
  SHA512:
6
- metadata.gz: 9e99db1533e3279c4d4b70ddb58609fb5e234e8e9124d401bff3970d5e8d2ffa5671556d1ec0e6706c0212868bde0c469f5e07e5a85609838aa5494df7bf9426
7
- data.tar.gz: 9d58190002e8cb8b490c61879f47901fe07c0ec374546dd52fae46d0c5c95ec8a6dce1c92eff850e957c0d776fe99f59577eb5c32f1bf46c2512a516140f385e
6
+ metadata.gz: 8648c96fdf3f0bf7c0b1ccc72f99bef2759db323a159e4361fc479f7d0d5847b2ad0f04361533e161f1a0cd6eb5b85544823f0159cd270adb29e6ac00cce10ed
7
+ data.tar.gz: 075d272a0af388c4addbcfb141ac0de8143d60b5b7ada4575d93a9ba84cc8e5b24b3c08490171196708012dbbed65dd5f2a588bdde538290f20ba121f79ced1a
checksums.yaml.gz.sig CHANGED
Binary file
data.tar.gz.sig CHANGED
Binary file
@@ -4,13 +4,14 @@
4
4
 
5
5
  require 'xws'
6
6
  require 'json'
7
- require 'polyrex'
7
+ require 'rxfhelper'
8
8
 
9
9
 
10
10
  class LiveBlogIndexer
11
11
 
12
- def initialize(filepath=nil)
12
+ def initialize(filepath='wordindex.json', urls_indexed: 'urls-indexed.json')
13
13
 
14
+ @wordindex_filepath, @urls_index_filepath = filepath, urls_indexed
14
15
  @master = if filepath and File.exists? filepath then
15
16
  JSON.parse(File.read(filepath))
16
17
  else
@@ -18,17 +19,26 @@ class LiveBlogIndexer
18
19
  end
19
20
 
20
21
  @xws = XWS.new
22
+ @url_index = {}
21
23
 
22
24
  end
23
25
 
24
26
  def add_index(src)
25
27
 
26
- doc = Rexle.new(RXFHelper.read(src).first )
28
+ doc = if src.is_a? String then
29
+ Rexle.new(RXFHelper.read(src).first )
30
+ else
31
+ src
32
+ end
33
+
34
+ link = doc.root.element('summary/link/text()')
35
+ return unless link
36
+
27
37
  sections = doc.root.xpath 'records/section'
28
38
 
29
39
  sections.each do |section|
30
40
 
31
- url = "%s/#%s" % [src[/^https?:\/\/[^\/]+(.*)(?=\/formatted.xml$)/,1], \
41
+ url = "%s/#%s" % [link[/^https?:\/\/[^\/]+(.*)(?=\/$)/,1], \
32
42
  section.attributes[:id]]
33
43
 
34
44
  h = @xws.scan section.element('details')
@@ -52,25 +62,45 @@ class LiveBlogIndexer
52
62
  end # /h
53
63
  end # /section
54
64
  end # /add_index
55
-
65
+
66
+ def crawl(location)
67
+
68
+ index_file location
69
+ save @wordindex_filepath
70
+ File.write @urls_index_filepath, @url_index.to_json
71
+
72
+ end
56
73
 
57
74
  def save(filepath=nil)
58
75
 
59
76
  File.write filepath, @master.to_json
60
77
  puts 'saved ' + File.basename(filepath)
61
78
 
62
- px = Polyrex.new 'words/key[word]/location[url, wordcount]'
63
-
64
- @master.each do |k,rows|
65
- px.create.key(word: k) do |create|
66
- rows.each do |k, v|
67
- create.location url: k, wordcount: v
68
- end
69
- end
79
+ end
80
+
81
+ private
82
+
83
+ def index_file(location)
84
+
85
+ puts 'indexing : ' + location.inspect
86
+ doc = Rexle.new(RXFHelper.read(location).first)
87
+ summary = doc.root.element 'summary'
88
+ return unless summary
89
+
90
+ result = add_index doc
91
+ return unless result
92
+
93
+ prev_day = summary.text 'prev_day'
94
+
95
+ if prev_day then
96
+ url = prev_day + 'formatted.xml'
97
+ @url_index[url] = {last_indexed: Time.now}
98
+ index_file(url)
70
99
  end
71
- pxfilepath = File.join(File.dirname(filepath), 'search-index.xml')
72
- px.save pxfilepath, options: {pretty: true}
73
- puts 'saved ' + File.basename(pxfilepath)
100
+ end
101
+
102
+ def save_urlsindex(filepath)
103
+
74
104
  end
75
105
 
76
- end
106
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: liveblog-indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -31,7 +31,7 @@ cert_chain:
31
31
  mOnsi2V1CXpq2biJtSgD7mBx4cO9FXgbK3Xnsv45ygAPo6jj4Eb34udqz+0v88Ys
32
32
  KoTqNQOniHAW2w==
33
33
  -----END CERTIFICATE-----
34
- date: 2015-11-12 00:00:00.000000000 Z
34
+ date: 2015-11-13 00:00:00.000000000 Z
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: xws
@@ -54,25 +54,25 @@ dependencies:
54
54
  - !ruby/object:Gem::Version
55
55
  version: 0.1.1
56
56
  - !ruby/object:Gem::Dependency
57
- name: polyrex
57
+ name: rxfhelper
58
58
  requirement: !ruby/object:Gem::Requirement
59
59
  requirements:
60
60
  - - "~>"
61
61
  - !ruby/object:Gem::Version
62
- version: '1.0'
62
+ version: '0.2'
63
63
  - - ">="
64
64
  - !ruby/object:Gem::Version
65
- version: 1.0.11
65
+ version: 0.2.3
66
66
  type: :runtime
67
67
  prerelease: false
68
68
  version_requirements: !ruby/object:Gem::Requirement
69
69
  requirements:
70
70
  - - "~>"
71
71
  - !ruby/object:Gem::Version
72
- version: '1.0'
72
+ version: '0.2'
73
73
  - - ">="
74
74
  - !ruby/object:Gem::Version
75
- version: 1.0.11
75
+ version: 0.2.3
76
76
  description:
77
77
  email: james@r0bertson.co.uk
78
78
  executables: []
metadata.gz.sig CHANGED
Binary file