liveblog-indexer 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 56937c6f8eb2b40f417e73a5512fa60593647517
4
- data.tar.gz: a87152912b799beab92e305565065ee391364b32
3
+ metadata.gz: 711ba6e2a290dfc67dfc1703ffcddf417c7c90d3
4
+ data.tar.gz: 5525552551eee1953370b99034d2a712bb308ac2
5
5
  SHA512:
6
- metadata.gz: 9e99db1533e3279c4d4b70ddb58609fb5e234e8e9124d401bff3970d5e8d2ffa5671556d1ec0e6706c0212868bde0c469f5e07e5a85609838aa5494df7bf9426
7
- data.tar.gz: 9d58190002e8cb8b490c61879f47901fe07c0ec374546dd52fae46d0c5c95ec8a6dce1c92eff850e957c0d776fe99f59577eb5c32f1bf46c2512a516140f385e
6
+ metadata.gz: 8648c96fdf3f0bf7c0b1ccc72f99bef2759db323a159e4361fc479f7d0d5847b2ad0f04361533e161f1a0cd6eb5b85544823f0159cd270adb29e6ac00cce10ed
7
+ data.tar.gz: 075d272a0af388c4addbcfb141ac0de8143d60b5b7ada4575d93a9ba84cc8e5b24b3c08490171196708012dbbed65dd5f2a588bdde538290f20ba121f79ced1a
checksums.yaml.gz.sig CHANGED
Binary file
data.tar.gz.sig CHANGED
Binary file
@@ -4,13 +4,14 @@
4
4
 
5
5
  require 'xws'
6
6
  require 'json'
7
- require 'polyrex'
7
+ require 'rxfhelper'
8
8
 
9
9
 
10
10
  class LiveBlogIndexer
11
11
 
12
- def initialize(filepath=nil)
12
+ def initialize(filepath='wordindex.json', urls_indexed: 'urls-indexed.json')
13
13
 
14
+ @wordindex_filepath, @urls_index_filepath = filepath, urls_indexed
14
15
  @master = if filepath and File.exists? filepath then
15
16
  JSON.parse(File.read(filepath))
16
17
  else
@@ -18,17 +19,26 @@ class LiveBlogIndexer
18
19
  end
19
20
 
20
21
  @xws = XWS.new
22
+ @url_index = {}
21
23
 
22
24
  end
23
25
 
24
26
  def add_index(src)
25
27
 
26
- doc = Rexle.new(RXFHelper.read(src).first )
28
+ doc = if src.is_a? String then
29
+ Rexle.new(RXFHelper.read(src).first )
30
+ else
31
+ src
32
+ end
33
+
34
+ link = doc.root.element('summary/link/text()')
35
+ return unless link
36
+
27
37
  sections = doc.root.xpath 'records/section'
28
38
 
29
39
  sections.each do |section|
30
40
 
31
- url = "%s/#%s" % [src[/^https?:\/\/[^\/]+(.*)(?=\/formatted.xml$)/,1], \
41
+ url = "%s/#%s" % [link[/^https?:\/\/[^\/]+(.*)(?=\/$)/,1], \
32
42
  section.attributes[:id]]
33
43
 
34
44
  h = @xws.scan section.element('details')
@@ -52,25 +62,45 @@ class LiveBlogIndexer
52
62
  end # /h
53
63
  end # /section
54
64
  end # /add_index
55
-
65
+
66
+ def crawl(location)
67
+
68
+ index_file location
69
+ save @wordindex_filepath
70
+ File.write @urls_index_filepath, @url_index.to_json
71
+
72
+ end
56
73
 
57
74
  def save(filepath=nil)
58
75
 
59
76
  File.write filepath, @master.to_json
60
77
  puts 'saved ' + File.basename(filepath)
61
78
 
62
- px = Polyrex.new 'words/key[word]/location[url, wordcount]'
63
-
64
- @master.each do |k,rows|
65
- px.create.key(word: k) do |create|
66
- rows.each do |k, v|
67
- create.location url: k, wordcount: v
68
- end
69
- end
79
+ end
80
+
81
+ private
82
+
83
+ def index_file(location)
84
+
85
+ puts 'indexing : ' + location.inspect
86
+ doc = Rexle.new(RXFHelper.read(location).first)
87
+ summary = doc.root.element 'summary'
88
+ return unless summary
89
+
90
+ result = add_index doc
91
+ return unless result
92
+
93
+ prev_day = summary.text 'prev_day'
94
+
95
+ if prev_day then
96
+ url = prev_day + 'formatted.xml'
97
+ @url_index[url] = {last_indexed: Time.now}
98
+ index_file(url)
70
99
  end
71
- pxfilepath = File.join(File.dirname(filepath), 'search-index.xml')
72
- px.save pxfilepath, options: {pretty: true}
73
- puts 'saved ' + File.basename(pxfilepath)
100
+ end
101
+
102
+ def save_urlsindex(filepath)
103
+
74
104
  end
75
105
 
76
- end
106
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: liveblog-indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -31,7 +31,7 @@ cert_chain:
31
31
  mOnsi2V1CXpq2biJtSgD7mBx4cO9FXgbK3Xnsv45ygAPo6jj4Eb34udqz+0v88Ys
32
32
  KoTqNQOniHAW2w==
33
33
  -----END CERTIFICATE-----
34
- date: 2015-11-12 00:00:00.000000000 Z
34
+ date: 2015-11-13 00:00:00.000000000 Z
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: xws
@@ -54,25 +54,25 @@ dependencies:
54
54
  - !ruby/object:Gem::Version
55
55
  version: 0.1.1
56
56
  - !ruby/object:Gem::Dependency
57
- name: polyrex
57
+ name: rxfhelper
58
58
  requirement: !ruby/object:Gem::Requirement
59
59
  requirements:
60
60
  - - "~>"
61
61
  - !ruby/object:Gem::Version
62
- version: '1.0'
62
+ version: '0.2'
63
63
  - - ">="
64
64
  - !ruby/object:Gem::Version
65
- version: 1.0.11
65
+ version: 0.2.3
66
66
  type: :runtime
67
67
  prerelease: false
68
68
  version_requirements: !ruby/object:Gem::Requirement
69
69
  requirements:
70
70
  - - "~>"
71
71
  - !ruby/object:Gem::Version
72
- version: '1.0'
72
+ version: '0.2'
73
73
  - - ">="
74
74
  - !ruby/object:Gem::Version
75
- version: 1.0.11
75
+ version: 0.2.3
76
76
  description:
77
77
  email: james@r0bertson.co.uk
78
78
  executables: []
metadata.gz.sig CHANGED
Binary file