parse_date 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -1
- data/.rubocop_todo.yml +15 -23
- data/README.md +121 -34
- data/lib/parse_date.rb +45 -6
- data/lib/parse_date/int_from_string.rb +209 -82
- data/lib/parse_date/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3927636e715e52bf2f38a8c0f3f384b06dc29971000c4b700a5e9f8a128c144b
|
4
|
+
data.tar.gz: 1f6b9fcb0f3bbf37f5d8417ee21c92ee19c87604394453dbe60a4f16b722e59c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c8d350764a5d813e5daa7e5abc0b17d9167b81b249322b52f181e7ea5ccba58f468883f30f6bdbd91e3a43f4da03270cd62581ea2e23ae5ac2a45261ecdf5e24
|
7
|
+
data.tar.gz: 6e421ee0b10f44f2cee021dc198cddb74a8640639b7c66ccb74f515ae891c4be6cadd47d6b5b12af1f4a9d6f3698f1fc3fe224c6be5aed03ce98eb5376c15d8f
|
data/.rubocop.yml
CHANGED
data/.rubocop_todo.yml
CHANGED
@@ -1,34 +1,33 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on 2019-10-
|
3
|
+
# on 2019-10-21 14:32:14 -0700 using RuboCop version 0.74.0.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
8
8
|
|
9
|
-
# Offense count:
|
10
|
-
# Cop supports --auto-correct.
|
11
|
-
Lint/StringConversionInInterpolation:
|
12
|
-
Exclude:
|
13
|
-
- 'lib/parse_date/int_from_string.rb'
|
14
|
-
|
15
|
-
# Offense count: 3
|
9
|
+
# Offense count: 5
|
16
10
|
Metrics/AbcSize:
|
17
|
-
Max:
|
11
|
+
Max: 37
|
18
12
|
|
19
|
-
# Offense count:
|
13
|
+
# Offense count: 8
|
20
14
|
# Configuration parameters: CountComments, ExcludedMethods.
|
21
15
|
# ExcludedMethods: refine
|
22
16
|
Metrics/BlockLength:
|
23
|
-
Max:
|
17
|
+
Max: 812
|
24
18
|
|
25
|
-
# Offense count:
|
19
|
+
# Offense count: 4
|
26
20
|
Metrics/CyclomaticComplexity:
|
27
|
-
Max:
|
21
|
+
Max: 12
|
28
22
|
|
29
23
|
# Offense count: 1
|
24
|
+
# Configuration parameters: CountComments.
|
25
|
+
Metrics/ModuleLength:
|
26
|
+
Max: 168
|
27
|
+
|
28
|
+
# Offense count: 4
|
30
29
|
Metrics/PerceivedComplexity:
|
31
|
-
Max:
|
30
|
+
Max: 12
|
32
31
|
|
33
32
|
# Offense count: 2
|
34
33
|
Style/Documentation:
|
@@ -53,18 +52,11 @@ Style/RegexpLiteral:
|
|
53
52
|
Exclude:
|
54
53
|
- 'lib/parse_date/int_from_string.rb'
|
55
54
|
|
56
|
-
# Offense count:
|
55
|
+
# Offense count: 3
|
57
56
|
# Cop supports --auto-correct.
|
58
57
|
# Configuration parameters: EnforcedStyleForMultiline.
|
59
58
|
# SupportedStylesForMultiline: comma, consistent_comma, no_comma
|
60
59
|
Style/TrailingCommaInArrayLiteral:
|
61
60
|
Exclude:
|
62
61
|
- 'spec/parse_date/int_from_string_spec.rb'
|
63
|
-
|
64
|
-
# Offense count: 1
|
65
|
-
# Cop supports --auto-correct.
|
66
|
-
# Configuration parameters: EnforcedStyleForMultiline.
|
67
|
-
# SupportedStylesForMultiline: comma, consistent_comma, no_comma
|
68
|
-
Style/TrailingCommaInHashLiteral:
|
69
|
-
Exclude:
|
70
|
-
- 'spec/parse_date/int_from_string_spec.rb'
|
62
|
+
- 'spec/parse_date_spec.rb'
|
data/README.md
CHANGED
@@ -30,40 +30,127 @@ ParseDate has class methods for date string parsing.
|
|
30
30
|
```
|
31
31
|
require 'parse_date'
|
32
32
|
|
33
|
-
ParseDate.
|
34
|
-
ParseDate.
|
35
|
-
ParseDate.
|
36
|
-
ParseDate.
|
37
|
-
ParseDate.
|
38
|
-
ParseDate.
|
39
|
-
ParseDate.
|
40
|
-
ParseDate.
|
41
|
-
ParseDate.
|
42
|
-
ParseDate.
|
43
|
-
ParseDate.
|
44
|
-
ParseDate.
|
45
|
-
|
46
|
-
ParseDate.
|
47
|
-
ParseDate.
|
48
|
-
ParseDate.
|
49
|
-
ParseDate.
|
50
|
-
ParseDate.
|
51
|
-
ParseDate.
|
52
|
-
|
53
|
-
ParseDate.
|
54
|
-
ParseDate.
|
55
|
-
ParseDate.
|
56
|
-
ParseDate.
|
57
|
-
ParseDate.
|
58
|
-
ParseDate.
|
59
|
-
ParseDate.
|
60
|
-
ParseDate.
|
61
|
-
ParseDate.
|
62
|
-
ParseDate.
|
63
|
-
ParseDate.
|
64
|
-
ParseDate.
|
65
|
-
ParseDate.
|
66
|
-
ParseDate.
|
33
|
+
ParseDate.parse_range('12/25/00') # [2000]
|
34
|
+
ParseDate.parse_range('5-1-25') # [1925]
|
35
|
+
ParseDate.parse_range('1666 B.C.') # [-1666]
|
36
|
+
ParseDate.parse_range('-914') # [-914]
|
37
|
+
ParseDate.parse_range('[c1926]') # [1926]
|
38
|
+
ParseDate.parse_range('ca. 1558') # [1558]
|
39
|
+
ParseDate.parse_range('195-') # (1950..1959).to_a
|
40
|
+
ParseDate.parse_range('199u') # (1990..1999).to_a
|
41
|
+
ParseDate.parse_range('197?') # (1970..1979).to_a
|
42
|
+
ParseDate.parse_range('196x') # (1960..1969).to_a
|
43
|
+
ParseDate.parse_range('18th century CE') # (1700..1799).to_a
|
44
|
+
ParseDate.parse_range('17uu') # (1700..1799).to_a
|
45
|
+
ParseDate.parse_range('between 1694 and 1799') # (1694..1799).to_a
|
46
|
+
ParseDate.parse_range('between 1 and 5') # (1..5).to_a
|
47
|
+
ParseDate.parse_range('between 300 and 150 B.C.') # (-300..-150).to_a
|
48
|
+
ParseDate.parse_range('-5 - 3') # (-5..3).to_a
|
49
|
+
ParseDate.parse_range('1496-1499') # (1496..1499).to_a
|
50
|
+
ParseDate.parse_range('1750?-1867') # (1750..1867).to_a
|
51
|
+
ParseDate.parse_range('17--?-18--?') # (1700..1899).to_a
|
52
|
+
ParseDate.parse_range('1835 or 1836') # [1835, 1836]
|
53
|
+
ParseDate.parse_range('17-- or 18--?') # (1700..1899).to_a
|
54
|
+
ParseDate.parse_range('-2 or 1?') # (-2..1).to_a
|
55
|
+
ParseDate.parse_range('17th or 18th century?') # (1600..1799).to_a
|
56
|
+
ParseDate.parse_range('ca. 5th–6th century A.D.') # (400..599).to_a
|
57
|
+
ParseDate.parse_range('ca. 9th–8th century B.C.') # (-999..-800).to_a
|
58
|
+
ParseDate.parse_range('ca. 13th–12th century B.C.') # (-1399..-1200).to_a
|
59
|
+
ParseDate.parse_range('5th century B.C.') # (-599..-500).to_a
|
60
|
+
ParseDate.parse_range('1975 - 1905') # first year > last year, raises error
|
61
|
+
ParseDate.parse_range('-100 - -150') # first year > last year, raises error
|
62
|
+
ParseDate.parse_range('1975 or 1905') # first year > last year, raises error
|
63
|
+
ParseDate.parse_range('2050') # year later than current year + 1, raises error
|
64
|
+
ParseDate.parse_range('12345') # year later than current year + 1, raises error
|
65
|
+
ParseDate.parse_range('random text') # can't parse years, raises error
|
66
|
+
ParseDate.parse_range(nil) # can't parse years, raises error
|
67
|
+
|
68
|
+
ParseDate.earliest_year('12/25/00') # 2000
|
69
|
+
ParseDate.earliest_year('5-1-21') # 1921
|
70
|
+
ParseDate.earliest_year('1666 B.C.') # -1666
|
71
|
+
ParseDate.earliest_year('-914') # -914
|
72
|
+
ParseDate.earliest_year('[c1926]') # 1926
|
73
|
+
ParseDate.earliest_year('ca. 1558') # 1558
|
74
|
+
ParseDate.earliest_year('195-') # 1950
|
75
|
+
ParseDate.earliest_year('199u') # 1990
|
76
|
+
ParseDate.earliest_year('197?') # 1970
|
77
|
+
ParseDate.earliest_year('196x') # 1960
|
78
|
+
ParseDate.earliest_year('18th century CE') # 1700
|
79
|
+
ParseDate.earliest_year('17uu') # 1700
|
80
|
+
ParseDate.earliest_year('between 1694 and 1799') # 1694
|
81
|
+
ParseDate.earliest_year('between 1 and 5') # 1
|
82
|
+
ParseDate.earliest_year('between 300 and 150 B.C.') # -300
|
83
|
+
ParseDate.earliest_year('1496-1499') # 1496
|
84
|
+
ParseDate.earliest_year('1750?-1867') # 1750
|
85
|
+
ParseDate.earliest_year('17--?-18--?') # 1700
|
86
|
+
ParseDate.earliest_year('1835 or 1836') # 1835
|
87
|
+
ParseDate.earliest_year('17-- or 18--?') # 1700
|
88
|
+
ParseDate.earliest_year('17th or 18th century?') # 1600
|
89
|
+
ParseDate.earliest_year('ca. 5th–6th century A.D.') # 400
|
90
|
+
ParseDate.earliest_year('ca. 9th–8th century B.C.') # -999
|
91
|
+
ParseDate.earliest_year('ca. 13th–12th century B.C.') # -1399
|
92
|
+
ParseDate.earliest_year('5th century B.C.') # -599
|
93
|
+
|
94
|
+
ParseDate.latest_year('195-') # 1959
|
95
|
+
ParseDate.latest_year('199u') # 1999
|
96
|
+
ParseDate.latest_year('197?') # 1979
|
97
|
+
ParseDate.latest_year('196x') # 1969
|
98
|
+
ParseDate.latest_year('18th century CE') # 1799
|
99
|
+
ParseDate.latest_year('17uu') # 1799
|
100
|
+
ParseDate.latest_year('between 1694 and 1799') # 1799
|
101
|
+
ParseDate.latest_year('between 1 and 5') # 5
|
102
|
+
ParseDate.latest_year('between 300 and 150 B.C.') # -150
|
103
|
+
ParseDate.latest_year('1496-1499') # 1499
|
104
|
+
ParseDate.latest_year('1750?-1867') # 1867
|
105
|
+
ParseDate.latest_year('17--?-18--?') # 1899
|
106
|
+
ParseDate.latest_year('1757-58') # 1758
|
107
|
+
ParseDate.latest_year('1975-05') # 1975 (range invalid)
|
108
|
+
ParseDate.latest_year('1835 or 1836') # 1836
|
109
|
+
ParseDate.latest_year('17-- or 18--?') # 1899
|
110
|
+
ParseDate.latest_year('17th or 18th century?') # 1799
|
111
|
+
ParseDate.latest_year('ca. 5th–6th century A.D.') # 599
|
112
|
+
ParseDate.latest_year('ca. 9th–8th century B.C.') # -800
|
113
|
+
ParseDate.latest_year('ca. 13th–12th century B.C.') # -1200
|
114
|
+
ParseDate.latest_year('5th century B.C.') # -500
|
115
|
+
ParseDate.latest_year('-5 - 3') # 3
|
116
|
+
|
117
|
+
ParseDate.range_array('1993', '1995') # [1993, 1994, 1995]
|
118
|
+
ParseDate.range_array(1993, 1995) # [1993, 1994, 1995]
|
119
|
+
ParseDate.range_array(0, '0001') # [0, 1]
|
120
|
+
ParseDate.range_array('-0003', '0000') # [-3, -2, -1, 0]
|
121
|
+
ParseDate.range_array(-1, 1) # [-1, 0, 1]
|
122
|
+
ParseDate.range_array(15, 15) # [15]
|
123
|
+
ParseDate.range_array(-100, '-99') # [-100, -99]
|
124
|
+
ParseDate.range_array('98', 101) # [98, 99, 100, 101]
|
125
|
+
ParseDate.range_array('word1', 'word2') # throws ArgumentError
|
126
|
+
ParseDate.range_array('1993', 1990) # throws StandardError - bad range
|
127
|
+
ParseDate.range_array('12345', 12345) # throws StandardError - bad range
|
128
|
+
|
129
|
+
ParseDate.year_range_valid?(1975, 1905) # false, first year > last year
|
130
|
+
ParseDate.year_range_valid?(-100, -150) # false, first year > last year
|
131
|
+
ParseDate.year_range_valid?(2050, 2070) # false, year later than current year + 1
|
132
|
+
ParseDate.year_range_valid?(2007, 2050) # false, year later than current year + 1
|
133
|
+
ParseDate.year_range_valid?(2007, 2009) # true
|
134
|
+
ParseDate.year_range_valid?(75, 150) # true
|
135
|
+
ParseDate.year_range_valid?(-3, 2) # true
|
136
|
+
ParseDate.year_range_valid?(-100, -50) # true
|
137
|
+
ParseDate.year_range_valid?(-1500, -1499) # true
|
138
|
+
ParseDate.year_range_valid?(-15000, -14999) # true
|
139
|
+
|
140
|
+
ParseDate.year_int_valid?(0) # true
|
141
|
+
ParseDate.year_int_valid?(5) # true
|
142
|
+
ParseDate.year_int_valid?(33) # true
|
143
|
+
ParseDate.year_int_valid?(150) # true
|
144
|
+
ParseDate.year_int_valid?(2019) # true
|
145
|
+
ParseDate.year_int_valid?(Date.today.year + 1) # true
|
146
|
+
ParseDate.year_int_valid?(-3) # true
|
147
|
+
ParseDate.year_int_valid?(-35) # true
|
148
|
+
ParseDate.year_int_valid?(-999) # true
|
149
|
+
ParseDate.year_int_valid?(-1666) # false - four digit negative years not considered valid here
|
150
|
+
ParseDate.year_int_valid?('165x') # false
|
151
|
+
ParseDate.year_int_valid?('198-') # false
|
152
|
+
ParseDate.year_int_valid?('random text') # false
|
153
|
+
ParseDate.year_int_valid?(nil) # false
|
67
154
|
```
|
68
155
|
|
69
156
|
## Development
|
data/lib/parse_date.rb
CHANGED
@@ -26,15 +26,54 @@ class ParseDate
|
|
26
26
|
extend ParseDate::IntFromString
|
27
27
|
|
28
28
|
# class method delegation
|
29
|
-
def self.earliest_year(
|
30
|
-
ParseDate::IntFromString.earliest_year(
|
29
|
+
def self.earliest_year(date_str)
|
30
|
+
ParseDate::IntFromString.earliest_year(date_str)
|
31
31
|
end
|
32
32
|
|
33
|
-
def self.latest_year(
|
34
|
-
ParseDate::IntFromString.latest_year(
|
33
|
+
def self.latest_year(date_str)
|
34
|
+
ParseDate::IntFromString.latest_year(date_str)
|
35
35
|
end
|
36
36
|
|
37
|
-
def self.year_int_valid?(
|
38
|
-
ParseDate::IntFromString.year_int_valid?(
|
37
|
+
def self.year_int_valid?(date_str)
|
38
|
+
ParseDate::IntFromString.year_int_valid?(date_str)
|
39
|
+
end
|
40
|
+
|
41
|
+
# @return [Array] array of Integer year values from earliest year to latest year, inclusive
|
42
|
+
def self.parse_range(date_str)
|
43
|
+
first = earliest_year(date_str)
|
44
|
+
last = latest_year(date_str)
|
45
|
+
raise ParseDate::Error, "Unable to parse range from '#{date_str}'" unless year_range_valid?(first, last)
|
46
|
+
|
47
|
+
range_array(first, last)
|
48
|
+
rescue StandardError => e
|
49
|
+
raise ParseDate::Error, "Unable to parse range from '#{date_str}': #{e.message}"
|
50
|
+
end
|
51
|
+
|
52
|
+
# true if:
|
53
|
+
# both years are not newer than (current year + 1)
|
54
|
+
# first_year <= last_year
|
55
|
+
# false otherwise
|
56
|
+
def self.year_range_valid?(first_year, last_year)
|
57
|
+
upper_bound = Date.today.year + 2
|
58
|
+
return false if first_year > upper_bound || last_year > upper_bound
|
59
|
+
return false if first_year > last_year
|
60
|
+
|
61
|
+
true
|
62
|
+
end
|
63
|
+
|
64
|
+
# @param [Integer, String] first_year, expecting integer or parseable string for .to_i
|
65
|
+
# @param [Integer, String] last_year, expecting integer or parseable string for .to_i
|
66
|
+
# @return [Array] array of Integer year values from first to last, inclusive
|
67
|
+
def self.range_array(first_year, last_year)
|
68
|
+
first_year = first_year.to_i if first_year.is_a?(String) && first_year.match?(/^-?\d+$/)
|
69
|
+
last_year = last_year.to_i if last_year.is_a?(String) && last_year.match?(/^-?\d+$/)
|
70
|
+
|
71
|
+
return [] unless last_year || first_year
|
72
|
+
return [first_year] if last_year.nil? && first_year
|
73
|
+
return [last_year] if first_year.nil? && last_year
|
74
|
+
raise(StandardError, "unable to create year range array from #{first_year}, #{last_year}") unless
|
75
|
+
year_range_valid?(first_year, last_year)
|
76
|
+
|
77
|
+
Range.new(first_year, last_year).to_a
|
39
78
|
end
|
40
79
|
end
|
@@ -7,7 +7,7 @@ class ParseDate
|
|
7
7
|
# Parse (Year) Integers from Date Strings
|
8
8
|
module IntFromString
|
9
9
|
|
10
|
-
# earliest year as Integer if we can parse one from
|
10
|
+
# earliest year as Integer if we can parse one from date_str
|
11
11
|
# e.g. if 17uu, result is 1700
|
12
12
|
# NOTE: if we have a x/x/yy or x-x-yy pattern (the only 2 digit year patterns
|
13
13
|
# found in our actual date strings in stanford-mods records), then
|
@@ -15,25 +15,30 @@ class ParseDate
|
|
15
15
|
# 1/1/17 -> 2017
|
16
16
|
# 1/1/27 -> 1927
|
17
17
|
# @return [Integer, nil] Integer year if we could parse one, nil otherwise
|
18
|
-
def self.earliest_year(
|
19
|
-
return
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
result ||= ParseDate.send(:
|
18
|
+
def self.earliest_year(date_str)
|
19
|
+
return unless date_str && !date_str.empty?
|
20
|
+
return if date_str == '0000-00-00' # shpc collection has these useless dates
|
21
|
+
|
22
|
+
# B.C. first (match longest string first)
|
23
|
+
return ParseDate.send(:earliest_century_bc, date_str) if date_str.match(YY_YY_CENTURY_BC_REGEX)
|
24
|
+
return ParseDate.send(:between_bc_earliest_year, date_str) if date_str.match(BETWEEN_Yn_AND_Yn_BC_REGEX)
|
25
|
+
return ParseDate.send(:year_int_for_bc, date_str) if date_str.match(YEAR_BC_REGEX)
|
26
|
+
|
27
|
+
result ||= ParseDate.send(:between_earliest_year, date_str)
|
28
|
+
result ||= ParseDate.send(:first_four_digits, date_str)
|
29
|
+
result ||= ParseDate.send(:year_from_mm_dd_yy, date_str)
|
30
|
+
result ||= ParseDate.send(:first_year_for_decade, date_str) # 198x or 201x
|
31
|
+
result ||= ParseDate.send(:first_year_for_century, date_str) # includes BC
|
32
|
+
result ||= ParseDate.send(:year_for_early_numeric, date_str)
|
28
33
|
unless result
|
29
34
|
# try removing brackets between digits in case we have 169[5] or [18]91
|
30
|
-
no_brackets = ParseDate.send(:remove_brackets,
|
35
|
+
no_brackets = ParseDate.send(:remove_brackets, date_str)
|
31
36
|
return earliest_year(no_brackets) if no_brackets
|
32
37
|
end
|
33
38
|
result.to_i if result && year_int_valid?(result.to_i)
|
34
39
|
end
|
35
40
|
|
36
|
-
# latest year as Integer if we can parse one from
|
41
|
+
# latest year as Integer if we can parse one from date_str
|
37
42
|
# e.g. if 17uu, result is 1799
|
38
43
|
# NOTE: if we have a x/x/yy or x-x-yy pattern (the only 2 digit year patterns
|
39
44
|
# found in our actual date strings in stanford-mods records), then
|
@@ -41,29 +46,36 @@ class ParseDate
|
|
41
46
|
# 1/1/17 -> 2017
|
42
47
|
# 1/1/27 -> 1927
|
43
48
|
# @return [Integer, nil] Integer year if we could parse one, nil otherwise
|
44
|
-
def self.latest_year(
|
45
|
-
return
|
46
|
-
|
47
|
-
|
48
|
-
#
|
49
|
-
return ParseDate.send(:
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
result ||= ParseDate.send(:
|
55
|
-
|
56
|
-
result ||= ParseDate.send(:
|
57
|
-
result ||= ParseDate.send(:
|
49
|
+
def self.latest_year(date_str)
|
50
|
+
return unless date_str && !date_str.empty?
|
51
|
+
return if date_str == '0000-00-00' # shpc collection has these useless dates
|
52
|
+
|
53
|
+
# B.C. first (match longest string first)
|
54
|
+
return ParseDate.send(:last_year_mult_centuries_bc, date_str) if date_str.match(YY_YY_CENTURY_BC_REGEX)
|
55
|
+
return ParseDate.send(:between_bc_latest_year, date_str) if date_str.match(BETWEEN_Yn_AND_Yn_BC_REGEX)
|
56
|
+
return ParseDate.send(:last_year_for_bc_century, date_str) if date_str.match(BC_CENTURY_REGEX)
|
57
|
+
return ParseDate.send(:year_int_for_bc, date_str) if date_str.match(BC_REGEX)
|
58
|
+
|
59
|
+
result ||= ParseDate.send(:between_latest_year, date_str)
|
60
|
+
result ||= ParseDate.send(:hyphen_4digit_latest_year, date_str)
|
61
|
+
result ||= ParseDate.send(:hyphen_2digit_latest_year, date_str)
|
62
|
+
result ||= ParseDate.send(:yyuu_after_hyphen, date_str)
|
63
|
+
result ||= ParseDate.send(:year_after_or, date_str)
|
64
|
+
result ||= ParseDate.send(:first_four_digits, date_str)
|
65
|
+
result ||= ParseDate.send(:year_from_mm_dd_yy, date_str)
|
66
|
+
result ||= ParseDate.send(:last_year_for_decade, date_str) # 198x or 201x
|
67
|
+
result ||= ParseDate.send(:last_year_mult_centuries, date_str) # nth-nth century
|
68
|
+
result ||= ParseDate.send(:last_year_for_century, date_str)
|
69
|
+
result ||= ParseDate.send(:last_year_for_early_numeric, date_str)
|
58
70
|
unless result
|
59
71
|
# try removing brackets between digits in case we have 169[5] or [18]91
|
60
|
-
no_brackets = ParseDate.send(:remove_brackets,
|
72
|
+
no_brackets = ParseDate.send(:remove_brackets, date_str)
|
61
73
|
return earliest_year(no_brackets) if no_brackets
|
62
74
|
end
|
63
75
|
result.to_i if result && year_int_valid?(result.to_i)
|
64
76
|
end
|
65
77
|
|
66
|
-
# true if the year is between -999 and (current year +
|
78
|
+
# true if the year is between -999 and (current year + 1), inclusive
|
67
79
|
# @return [Boolean] true if the year is between -999 and (current year + 1); false otherwise
|
68
80
|
def self.year_int_valid?(year)
|
69
81
|
return false unless year.is_a? Integer
|
@@ -73,18 +85,94 @@ class ParseDate
|
|
73
85
|
|
74
86
|
protected
|
75
87
|
|
76
|
-
|
88
|
+
REGEX_OPTS = Regexp::IGNORECASE | Regexp::MULTILINE
|
89
|
+
BC_REGEX = Regexp.new(/\s*B\.?\s*C\.?/im)
|
90
|
+
BRACKETS_BETWEEN_DIGITS_REGEX = Regexp.new('\d[' + Regexp.escape('[]') + ']\d')
|
77
91
|
|
78
92
|
# removes brackets between digits such as 169[5] or [18]91
|
79
|
-
def remove_brackets(
|
80
|
-
|
93
|
+
def remove_brackets(date_str)
|
94
|
+
date_str.delete('[]') if date_str.match(BRACKETS_BETWEEN_DIGITS_REGEX)
|
95
|
+
end
|
96
|
+
|
97
|
+
YYYY_HYPHEN_YYYY_REGEX = Regexp.new(/(?<first>\d{4})\??\s*-\s*(?<last>\d{4})\??/m)
|
98
|
+
|
99
|
+
# Integer value for latest year if we have "yyyy-yyyy" pattern
|
100
|
+
# @return [Integer, nil] yyyy if date_str matches pattern; nil otherwise
|
101
|
+
def hyphen_4digit_latest_year(date_str)
|
102
|
+
Regexp.last_match(:last).to_i if date_str.match(YYYY_HYPHEN_YYYY_REGEX)
|
103
|
+
end
|
104
|
+
|
105
|
+
YYYY_HYPHEN_YY_REGEX = Regexp.new(/(?<first>\d{4})\??\s*-\s*(?<last>\d{2})\??([^-0-9].*)?$/)
|
106
|
+
|
107
|
+
# Integer value for latest year if we have "yyyy-yy" pattern
|
108
|
+
# @return [Integer, nil] yyyy if date_str matches pattern; nil otherwise
|
109
|
+
def hyphen_2digit_latest_year(date_str)
|
110
|
+
matches = date_str.match(YYYY_HYPHEN_YY_REGEX)
|
111
|
+
return unless matches
|
112
|
+
|
113
|
+
first = Regexp.last_match(:first)
|
114
|
+
century = first[0, 2]
|
115
|
+
last = "#{century}#{Regexp.last_match(:last)}"
|
116
|
+
last.to_i if ParseDate.year_range_valid?(first.to_i, last.to_i)
|
117
|
+
end
|
118
|
+
|
119
|
+
YYUU = '\\d{1,2}[u\\-]{2}'
|
120
|
+
YYuu_HYPHEN_YYuu_REGEX = Regexp.new("(?<first>#{YYUU})\\??\\s*-\\s*(?<last>#{YYUU})\\??([^u\\-]|$)??", REGEX_OPTS)
|
121
|
+
|
122
|
+
# Integer value for latest year if we have "yyuu-yyuu" pattern
|
123
|
+
# @return [Integer, nil] yyyy if date_str matches pattern; nil otherwise
|
124
|
+
def yyuu_after_hyphen(date_str)
|
125
|
+
last_year_for_century(Regexp.last_match(:last)).to_i if date_str.match(YYuu_HYPHEN_YYuu_REGEX)
|
126
|
+
end
|
127
|
+
|
128
|
+
YYXX = '\\d{1,2}[u\\-\\d]{2}'
|
129
|
+
YExx_OR_YExx_REGEX = Regexp.new("(?<first>#{YYXX})\\??\\s*or\\s*(?<last>#{YYXX})\\??([^u\\-]|$)??", REGEX_OPTS)
|
130
|
+
|
131
|
+
# Integer value for latest year if we have "yyyy or yyyy" pattern
|
132
|
+
# @return [Integer, nil] yyyy if date_str matches pattern; nil otherwise
|
133
|
+
def year_after_or(date_str)
|
134
|
+
latest_year(Regexp.last_match(:last)).to_i if date_str.match(YExx_OR_YExx_REGEX)
|
135
|
+
end
|
136
|
+
|
137
|
+
# NOTE: some actual data seemed to have a diff hyphen char. (slightly longer)
|
138
|
+
YY_YY_CENTURY_REGEX = Regexp.new(/(?<first>\d{1,2})[a-z]{2}?\s*(-|–|or)\s*(?<last>\d{1,2})[a-z]{2}?\s+centur.*/im)
|
139
|
+
|
140
|
+
# Integer value for latest year if we have nth-nth century pattern
|
141
|
+
# @return [Integer, nil] yyyy if date_str matches pattern; nil otherwise
|
142
|
+
def last_year_mult_centuries(date_str)
|
143
|
+
matches = date_str.match(YY_YY_CENTURY_REGEX)
|
144
|
+
return unless matches
|
145
|
+
|
146
|
+
nth = Regexp.last_match(:last).to_i
|
147
|
+
(nth - 1) * 100 + 99
|
148
|
+
end
|
149
|
+
|
150
|
+
YY_YY_CENTURY_BC_REGEX = Regexp.new("#{YY_YY_CENTURY_REGEX}#{BC_REGEX}", REGEX_OPTS)
|
151
|
+
|
152
|
+
# Integer value for earliest year if we have nth-nth century BC pattern
|
153
|
+
# @return [Integer, nil] yyyy if date_str matches pattern; nil otherwise
|
154
|
+
def earliest_century_bc(date_str)
|
155
|
+
matches = date_str.match(YY_YY_CENTURY_BC_REGEX)
|
156
|
+
return unless matches
|
157
|
+
|
158
|
+
nth = Regexp.last_match(:first).to_i
|
159
|
+
nth * -100 - 99
|
160
|
+
end
|
161
|
+
|
162
|
+
# Integer value for latest year if we have nth-nth century BC pattern
|
163
|
+
# @return [Integer, nil] yyyy if date_str matches pattern; nil otherwise
|
164
|
+
def last_year_mult_centuries_bc(date_str)
|
165
|
+
matches = date_str.match(YY_YY_CENTURY_BC_REGEX)
|
166
|
+
return unless matches
|
167
|
+
|
168
|
+
nth = Regexp.last_match(:last).to_i
|
169
|
+
nth * -100
|
81
170
|
end
|
82
171
|
|
83
|
-
# looks for 4 consecutive digits in
|
84
|
-
# @return [String, nil] 4 digit year (e.g. 1865, 0950) if
|
85
|
-
def first_four_digits(
|
86
|
-
|
87
|
-
matches&.to_s
|
172
|
+
# looks for 4 consecutive digits in date_str and returns first occurrence if found
|
173
|
+
# @return [String, nil] 4 digit year (e.g. 1865, 0950) if date_str has yyyy, nil otherwise
|
174
|
+
def first_four_digits(date_str)
|
175
|
+
Regexp.last_match(1) if date_str.match(/(\d{4})([^\d]|$)/im)
|
88
176
|
end
|
89
177
|
|
90
178
|
# returns 4 digit year as String if we have a x/x/yy or x-x-yy pattern
|
@@ -92,16 +180,14 @@ class ParseDate
|
|
92
180
|
# we use 20 as century digits unless it is greater than current year:
|
93
181
|
# 1/1/17 -> 2017
|
94
182
|
# 1/1/27 -> 1927
|
95
|
-
# @return [String, nil] 4 digit year (e.g. 1865, 0950) if
|
96
|
-
def year_from_mm_dd_yy(
|
97
|
-
|
98
|
-
|
99
|
-
slash_matches = orig_date_str.match(/\d{1,2}\/\d{1,2}\/\d{2}/)
|
183
|
+
# @return [String, nil] 4 digit year (e.g. 1865, 0950) if date_str matches pattern, nil otherwise
|
184
|
+
def year_from_mm_dd_yy(date_str)
|
185
|
+
slash_matches = date_str.match(/\d{1,2}\/\d{1,2}\/\d{2}/)
|
100
186
|
if slash_matches
|
101
|
-
date_obj = Date.strptime(
|
187
|
+
date_obj = Date.strptime(date_str, '%m/%d/%y')
|
102
188
|
else
|
103
|
-
hyphen_matches =
|
104
|
-
date_obj = Date.strptime(
|
189
|
+
hyphen_matches = date_str.match(/\d{1,2}-\d{1,2}-\d{2}/)
|
190
|
+
date_obj = Date.strptime(date_str, '%m-%d-%y') if hyphen_matches
|
105
191
|
end
|
106
192
|
date_obj = Date.new(date_obj.year - 100, date_obj.month, date_obj.mday) if date_obj && date_obj > Date.today
|
107
193
|
date_obj.year.to_s if date_obj
|
@@ -109,66 +195,107 @@ class ParseDate
|
|
109
195
|
nil # explicitly want nil if date won't parse
|
110
196
|
end
|
111
197
|
|
112
|
-
|
198
|
+
DECADE_4CHAR_REGEX = Regexp.new('(^|\D)\d{3}[u\-?x]', REGEX_OPTS)
|
113
199
|
|
114
200
|
# first year of decade (as String) if we have: yyyu, yyy-, yyy? or yyyx pattern
|
115
201
|
# note that these are the only decade patterns found in our actual date strings in MODS records
|
116
|
-
# @return [String, nil] 4 digit year (e.g. 1860, 1950) if
|
117
|
-
def first_year_for_decade(
|
118
|
-
decade_matches =
|
202
|
+
# @return [String, nil] 4 digit year (e.g. 1860, 1950) if date_str matches pattern, nil otherwise
|
203
|
+
def first_year_for_decade(date_str)
|
204
|
+
decade_matches = date_str.match(DECADE_4CHAR_REGEX)
|
119
205
|
changed_to_zero = decade_matches.to_s.tr('u\-?x', '0') if decade_matches
|
120
206
|
ParseDate.first_four_digits(changed_to_zero) if changed_to_zero
|
121
207
|
end
|
122
208
|
|
123
209
|
# last year of decade (as String) if we have: yyyu, yyy-, yyy? or yyyx pattern
|
124
210
|
# note that these are the only decade patterns found in our actual date strings in MODS records
|
125
|
-
# @return [String, nil] 4 digit year (e.g.
|
126
|
-
def last_year_for_decade(
|
127
|
-
decade_matches =
|
211
|
+
# @return [String, nil] 4 digit year (e.g. 1869, 1959) if date_str matches pattern, nil otherwise
|
212
|
+
def last_year_for_decade(date_str)
|
213
|
+
decade_matches = date_str.match(DECADE_4CHAR_REGEX)
|
128
214
|
changed_to_nine = decade_matches.to_s.tr('u\-?x', '9') if decade_matches
|
129
215
|
ParseDate.first_four_digits(changed_to_nine) if changed_to_nine
|
130
216
|
end
|
131
217
|
|
132
|
-
|
133
|
-
|
218
|
+
CENTURY_WORD_REGEX = Regexp.new('(\d{1,2})[a-z]{2}?\s*century', REGEX_OPTS)
|
219
|
+
CENTURY_4CHAR_REGEX = Regexp.new('(\d{1,2})[u\-]{2}([^u\-]|$)', REGEX_OPTS)
|
220
|
+
BC_CENTURY_REGEX = Regexp.new("#{CENTURY_WORD_REGEX}\\s+#{BC_REGEX}", REGEX_OPTS)
|
221
|
+
|
222
|
+
# first year of century if we have: yyuu, yy--, yy--? or xxth century pattern; handles B.C.
|
223
|
+
# @return [Integer, nil] yy00 if date_str matches pattern, nil otherwise
|
224
|
+
def first_year_for_century(date_str)
|
225
|
+
return Regexp.last_match(1).to_i * -100 - 99 if date_str.match(BC_CENTURY_REGEX)
|
226
|
+
return Regexp.last_match(1).to_i * 100 if date_str.match(CENTURY_4CHAR_REGEX)
|
227
|
+
return (Regexp.last_match(:first).to_i - 1) * 100 if date_str.match(YY_YY_CENTURY_REGEX)
|
228
|
+
return (Regexp.last_match(1).to_i - 1) * 100 if date_str.match(CENTURY_WORD_REGEX)
|
229
|
+
end
|
230
|
+
|
231
|
+
# last year of century if we have: yyuu, yy--, yy--? or xxth century pattern
|
232
|
+
# @return [Integer, nil] yy99 if date_str matches pattern, nil otherwise; does not check for B.C. itself (latest_year matches B.C. patterns first)
|
233
|
+
def last_year_for_century(date_str)
|
234
|
+
return Regexp.last_match(1).to_i * 100 + 99 if date_str.match(CENTURY_4CHAR_REGEX)
|
235
|
+
return (Regexp.last_match(1).to_i - 1) * 100 + 99 if date_str.match(CENTURY_WORD_REGEX)
|
236
|
+
end
|
237
|
+
|
238
|
+
# last year of century (as Integer) if we have: nth century BC
|
239
|
+
# @return [Integer, nil] n * -100 (e.g. -500 for 5th century B.C.) if date_str matches nth century B.C. pattern, nil otherwise
|
240
|
+
def last_year_for_bc_century(date_str)
|
241
|
+
Regexp.last_match(1).to_i * -100 if date_str.match(BC_CENTURY_REGEX)
|
242
|
+
end
|
243
|
+
|
244
|
+
BETWEEN_Yn_AND_Yn_REGEX = Regexp.new(/between\s+(?<first>\d{1,4})\??\s+and\s+(?<last>\d{1,4})\??/im)
|
134
245
|
|
135
|
-
#
|
136
|
-
#
|
137
|
-
# @return [
|
138
|
-
def
|
139
|
-
|
140
|
-
return if orig_date_str =~ /B\.C\./
|
141
|
-
return "#{Regexp.last_match(1)}00" if orig_date_str.match(CENTURY_4CHAR_REGEXP)
|
142
|
-
return "#{(Regexp.last_match(1).to_i - 1).to_s}00" if orig_date_str.match(CENTURY_WORD_REGEXP)
|
246
|
+
# Integer value for earliest if we have "between y and y" pattern
|
247
|
+
# NOTE: must match for BC first with between_bc_earliest_year
|
248
|
+
# @return [Integer, nil] year if date_str matches pattern; nil otherwise
|
249
|
+
def between_earliest_year(date_str)
|
250
|
+
Regexp.last_match(:first).to_i if date_str.match(BETWEEN_Yn_AND_Yn_REGEX)
|
143
251
|
end
|
144
252
|
|
145
|
-
#
|
146
|
-
#
|
147
|
-
# @return [
|
148
|
-
def
|
149
|
-
|
150
|
-
|
151
|
-
|
253
|
+
# Integer value for latest year if we have "between y and y" pattern
|
254
|
+
# NOTE: must match for BC first with between_bc_latest_year
|
255
|
+
# @return [Integer, nil] year if date_str matches pattern; nil otherwise
|
256
|
+
def between_latest_year(date_str)
|
257
|
+
Regexp.last_match(:last).to_i if date_str.match(BETWEEN_Yn_AND_Yn_REGEX)
|
258
|
+
end
|
259
|
+
|
260
|
+
YEAR_BC_REGEX = Regexp.new("(\\d{1,4})#{BC_REGEX}", REGEX_OPTS)
|
152
261
|
|
153
|
-
|
154
|
-
|
262
|
+
# Integer value for B.C. if we have B.C. pattern
|
263
|
+
# @return [Integer, nil] -ddd if B.C. in pattern; nil otherwise
|
264
|
+
def year_int_for_bc(date_str)
|
265
|
+
"-#{Regexp.last_match(1)}".to_i if date_str.match(YEAR_BC_REGEX)
|
155
266
|
end
|
156
267
|
|
157
|
-
|
268
|
+
BETWEEN_Yn_AND_Yn_BC_REGEX = Regexp.new("#{BETWEEN_Yn_AND_Yn_REGEX}#{BC_REGEX}", REGEX_OPTS)
|
158
269
|
|
159
|
-
# Integer
|
160
|
-
# @return [Integer, nil]
|
161
|
-
def
|
162
|
-
|
163
|
-
"-#{Regexp.last_match(1)}".to_i if bc_matches
|
270
|
+
# Integer value for earliest year if we have "between y and y B.C." pattern
|
271
|
+
# @return [Integer, nil] -ddd if date_str matches pattern; nil otherwise
|
272
|
+
def between_bc_earliest_year(date_str)
|
273
|
+
"-#{Regexp.last_match(:first)}".to_i if date_str.match(BETWEEN_Yn_AND_Yn_BC_REGEX)
|
164
274
|
end
|
165
275
|
|
166
|
-
|
276
|
+
# Integer value for latest year if we have "between y and y B.C." pattern
|
277
|
+
# @return [Integer, nil] -ddd if date_str matches pattern; nil otherwise
|
278
|
+
def between_bc_latest_year(date_str)
|
279
|
+
"-#{Regexp.last_match(:last)}".to_i if date_str.match(BETWEEN_Yn_AND_Yn_BC_REGEX)
|
280
|
+
end
|
281
|
+
|
282
|
+
EARLY_NUMERIC_REGEX = Regexp.new('^\-?\d{1,3}([^\du\-\[]|$)', REGEX_OPTS)
|
283
|
+
|
284
|
+
# year if date_str contains yyy, yy, y, -y, -yy, -yyy, -yyyy
|
285
|
+
# @return [Integer, nil] year if date_str matches pattern; nil otherwise
|
286
|
+
def year_for_early_numeric(date_str)
|
287
|
+
date_str.to_i if date_str.match(EARLY_NUMERIC_REGEX) || date_str =~ /^-\d{4}([^\du\-\[]|$)$/
|
288
|
+
end
|
289
|
+
|
290
|
+
FIRST_LAST_EARLY_NUMERIC_REGEX =
|
291
|
+
Regexp.new(/^(?<first>\-?\d{1,3})\??\s*(-|–|or)\s*(?<last>\-?\d{1,4})\??([^\du\-\[]|$)/im)
|
292
|
+
|
293
|
+
# Integer value for latest year if we have early numeric year range or single early numeric year
|
294
|
+
# @return [Integer, nil] year if date_str matches pattern; nil otherwise
|
295
|
+
def last_year_for_early_numeric(date_str)
|
296
|
+
return Regexp.last_match(:last).to_i if date_str.match(FIRST_LAST_EARLY_NUMERIC_REGEX)
|
167
297
|
|
168
|
-
|
169
|
-
# @return [String, nil] -ddd if orig_date_str matches pattern; nil otherwise
|
170
|
-
def year_for_early_numeric(orig_date_str)
|
171
|
-
orig_date_str if orig_date_str.match(EARLY_NUMERIC) || orig_date_str =~ /^-\d{4}$/
|
298
|
+
year_for_early_numeric(date_str) # if single year, not matched above
|
172
299
|
end
|
173
300
|
end
|
174
301
|
end
|
data/lib/parse_date/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parse_date
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naomi Dushay
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-10-
|
11
|
+
date: 2019-10-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: zeitwerk
|